def search_for_departments(keywords):
    keyw = keywords.split()
    dept_ids = []
    try:
        conn = lite.connect(db)
        conn.row_factory = lite.Row
        with conn:
            for k in keyw:
                k_stemmed = stemmer.stem(k)
                print(k_stemmed)
                cur = conn.cursor()
                cur.execute("SELECT id FROM departments where name like ?", ("%" + k_stemmed + "%",))
                rows = cur.fetchall()
                if len(rows) > 0:
                    print("found in courses.........")
                    for row in rows:
                        dept_ids.append(row[0])
            count_keywords = len(keyw)
            common_ids = {x: dept_ids.count(x) for x in dept_ids}
            out = []
            for x in common_ids:
                if common_ids[x] == count_keywords:
                    cur.execute("SELECT name FROM departments where id = ?", (x,))
                    rows = cur.fetchone()
                    out.append(str(x) + ":" + rows[0])
            return out
    except lite.Error as e:
        print('error opening table departments', e)
        return []
def pre_process(self, uid, tokens):
    # adding into active terms before stemming
    self.active_terms.append((self.timestamp, tokens, uid))
    while len(self.active_terms) > 0:
        term = self.active_terms[0]
        if term[0] < self.timestamp - _ACTIVE_WINDOW_SIZE * 60:
            self.active_terms.popleft()
        else:
            break
    # stemming
    tokens = map(lambda x: stemmer.stem(x), tokens)
    if len(tokens) < 1:
        return None
    # hashing
    results = []  # (counts, reserved_slot, n_words, h)
    for h in range(fast_hashing.HASH_NUMBER):
        results.append(({}, {}, len(tokens), h))
    for token in tokens:
        hash_code = np.array(fast_hashing.hash_code(token)) % _SKETCH_BUCKET_SIZE
        for h in range(fast_hashing.HASH_NUMBER):
            code = hash_code[h]
            if code in results[h][0]:
                results[h][0][code] += 1
            else:
                results[h][0][code] = 1
    return results
def checkword(word, words, firstword=False, context=''):
    if word in words or stemmer.stem(word) in words:
        return True
    if len(word) > 25:
        return False
    if firstword and (str(word[0]).upper() + word[1:] in words):
        return True
    print("Found a word that wasn't recognized: ", word, ", in the line: ")
    print(re.sub(word, word.upper(), context), end='')
    print("We're looking for close matches to this word. Please wait...")
    dist = 0
    editdists = [10] * 5
    wordlist = {0: '', 1: '', 2: '', 3: '', 4: ''}
    longest = max(editdists)
    index = 1
    for line in words:
        # Stuff to make it faster!
        # Match upper/lowercase.
        if not firstword and word[0].isupper() != line[0].isupper():
            continue
        # Don't allow words that are too long or too short from the dictionary.
        if len(line) - 3 > len(word) or len(word) - 3 > len(line):
            continue
        # Randomly check if some letters are contained in both, only for long
        # enough words. This is the part that really gets the time down to the
        # order of seconds, rather than minutes.
        if len(word) > 3:
            count = 0
            for x in range(int(len(word) / 4)):
                if word[int(random.random() * len(word))] in line:
                    count += 1
            if count < (int(len(word) / 4)) - int(4 / len(word)):
                continue
        # If we get through all that, then calculate the min edit distance.
        # Ignore case if it's the first word.
        if firstword:
            dist = mineditdist(line.lower(), word.lower())
        else:
            dist = mineditdist(line, word)
        # Saves the word if it's in the top 5.
        if dist < longest:
            index = editdists.index(longest)
            editdists.remove(longest)
            wordlist[index] = line
            editdists.insert(index, dist)
            longest = max(editdists)
    # Ordering the list of words that are the closest to the source word.
    returnlist = []
    while len(returnlist) < 5:
        for val in editdists:
            if len(returnlist) == 5:
                break
            if val == min(editdists):
                returnlist.append(wordlist.get(editdists.index(val)))
                wordlist.pop(editdists.index(val), 0)
                editdists[editdists.index(val)] = 11
    print(returnlist)
    return returnlist
def search(self, query, isPhrase, isOrMatch):
    results = []
    stemmed = [stem(t) for t in query.split(" ")]
    if isPhrase:
        results = self.phraseSearch(stemmed)
    else:
        results = self.termSearch(stemmed, isOrMatch)
    for doc in self.removeNailPolish(results):
        self.printResult(doc)
def load_text():
    opinions = {}
    with codecs.open('opinie1', 'r', encoding='utf-8') as my_file:
        for line in my_file:
            pair = line.split(";", 2)
            key = pair[1]
            key = split_to_words(key)
            key = stemmer.stem(key)
            key = " ".join(key)
            value = pair[0]
            opinions[key] = float(value)
    return opinions
def bagOfWords(s, words):
    bag = [0 for _ in range(len(words))]
    s_words = nltk.word_tokenize(s)
    s_words = [stemmer.stem(word.lower()) for word in s_words]
    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                bag[i] = 1
    return numpy.array(bag)
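# --- usage sketch (an assumption, not part of the original snippet) ----------
# bagOfWords above relies on module-level nltk, numpy and stemmer objects; a
# common setup for this chatbot-style bag-of-words pattern is NLTK's
# LancasterStemmer. The vocabulary passed in must already be stemmed and
# lowercased the same way the sentence tokens are.
import nltk
import numpy
from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
# nltk.download('punkt')  # word_tokenize needs the punkt model on first use

if __name__ == "__main__":
    vocab = [stemmer.stem(w.lower()) for w in ["hello", "order", "pizza", "bye"]]
    vector = bagOfWords("I would like to order a pizza", vocab)
    print(vector)  # 0/1 numpy array with one slot per vocabulary entry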
def analyse_topics(self, _probs):
    words = set()
    for term in self.active_terms:
        for word in term[1]:
            words.add(word)
    print "size of words:", len(words)
    high_prob_words = []
    for _word in words:
        word = stemmer.stem(_word)
        hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
        min_prob_list = []
        for h in range(fast_hashing.HASH_NUMBER):
            prob = _probs[h][hash_code[h]]
            min_prob_list.append(prob)
        min_prob_list.sort()
        min_prob = min_prob_list[1]  # !!!
        if min_prob >= _PROBABILITY_THRESHOLD:
            high_prob_words.append((word, min_prob))
    # rescale
    s_prob = sum([p for w, p in high_prob_words])
    high_prob_words = [(w, p / s_prob) for w, p in high_prob_words]
    high_prob_words.sort(key=lambda x: x[1], reverse=True)
    # top 20
    high_prob_words = high_prob_words[:20]
    post_res = postprocessor.process(high_prob_words, self.active_terms)
    if eval(config.get('output', 'debug_info')):
        self.output.write('high_prob_words\n')
        self.output.write(str(high_prob_words))  # debugging
        self.output.write('\npost_res\n')
        self.output.write(str(post_res))  # debugging
        self.output.write('\n')
    flag, word_level_results, _ = post_res
    if flag:
        event = dict()
        event['detection_time'] = str(datetime.utcfromtimestamp(self.timestamp))
        event_words = list()
        for prob_word, word_flag in zip(high_prob_words, word_level_results):
            _word = prob_word[0]
            if word_flag:
                event_words.append(_word)
        event['key_words'] = event_words
        self.output.write(json.dumps(event))
        self.output.write('\n')
def normalizeText(self, text):
    text = text.lower()
    text = re.sub(r'[^0-9a-zA-Z]+', ' ', text)
    articleWords = text.split()
    articleWords = self.removeStopWords(articleWords)
    stemmedWords = []
    for word in articleWords:
        stemmed = stemmer.stem(word)
        # p = stemmer.PorterStemmer()
        # stemmed = p.stemWord(word)
        self.reverseStemHashtable[stemmed] = word
        stemmedWords.append(stemmed)
    return stemmedWords
def index_stem(id, doc):
    terms = doc.split()
    for term in terms:
        term = term.lower()
        term = clean(term)
        term = stemmer.stem(term, 0, len(term) - 1)
        doc_ids = inverted_index.get(term)
        if doc_ids:
            doc_ids.add(id)
        else:
            inverted_index[term] = set()
            inverted_index[term].add(id)
def process_word_cloud(word_cloud):
    latin_numbers = "1234567890IIIVXIVIΙΙΙ"
    symbols = "[]():-!?&,'//"
    # create a list of stopwords
    stop_words = []
    with open('greek_stop_words.txt', 'r') as fo:
        for line in fo:
            w = line.strip()
            if w[-1] == ',':
                w = w[:-1]
            w = greek_to_upper(w)
            if w not in stop_words:
                stop_words.append(w)
    for x in symbols:
        word_cloud = word_cloud.replace(x, ' ')
    wc = word_cloud.split(' ')
    wc_up = []
    for w in wc:
        wc_up.append(greek_to_upper(w))
    new_wc = []
    for w in wc_up:
        if w not in stop_words and w not in latin_numbers and len(w) > 2:
            new_wc.append(w)
        else:
            print("stop word eliminated:", w)
    print(len(new_wc))
    print(new_wc)
    stems_list = {}
    for w in new_wc:
        st_w = stemmer.stem(w)
        print(w, st_w)
        if st_w not in stems_list:
            stems_list[st_w] = [w]
        else:
            stems_list[st_w].append(w)
    for x in stems_list:
        print(x, stems_list[x])
    word_frequencies = {}
    for x in stems_list:
        # find the shortest member of the corresponding list for the stem
        word_frequencies[min(stems_list[x], key=len)] = len(stems_list[x])
    # for x in word_frequencies: print(x.lower(), word_frequencies[x])
    # new_text = ""
    # for x in word_frequencies:
    #     new_text += (" " + x) * word_frequencies[x]
    # print(new_text.lower())
    # return new_text.lower()
    print(word_frequencies)
    # input()
    return word_frequencies
def preprocessing(self, text):
    """Replace the unusual characters in the text."""
    to_replace = [
        '!', '#', '%', '$', "'", '&', ')', '(', '+', '*', '-', ',', '/', '.',
        '1', '0', '3', '2', '5', '4', '7', '6', '9', '8', ';', ':', '?', '_', '^',
    ]
    lowered = text.encode('ascii', 'ignore').lower()
    replacing = lowered
    for char_to_replace in to_replace:
        replacing = replacing.replace(char_to_replace, ' ' + char_to_replace + ' ')
    stemming = ' '
    splited = replacing.split()
    # return replacing
    return stemming.join([stem(item) for item in splited])
def main(use_tfidf, opinion_text):
    """Print rating of given text"""
    opinion = " ".join(stemmer.stem(
        create_vectors.split_to_words(opinion_text.decode('utf-8'))))
    trainset = create_vectors.load_text()
    if use_tfidf:
        keywords_file = 'tfidf_keywords'
    else:
        keywords_file = 'keywords'
    regression_file = 'regr_for_{}'.format(keywords_file)
    with codecs.open(keywords_file, 'r', encoding='utf-8') as kfile:
        keywords = json.load(kfile)
    regr = get_regression_from_file(regression_file)
    rating = get_rating(opinion, regr, trainset, keywords, use_tfidf)
    if rating < 0.0:
        rating = 0.0
    elif rating > 5.0:
        rating = 5.0
    print '{:.2f}'.format(rating)
def getData(company, amount, datef, datet):
    news_dates, news, news_count = downloadNews(company, amount)
    writeNews(news_dates, news, news_count,
              path + 'news' + sep + '{}.csv'.format(company))
    # news_dates, news, news_count = readNews(path + 'news' + sep + '{}.csv'.format(company))
    stocks_dates, stocks, stocks_count = downloadStock(company, datef, datet)
    writeStock(stocks_dates, stocks, stocks_count,
               path + 'stocks' + sep + '{}.csv'.format(company))
    # stocks_dates, stocks, stocks_count = readStock(path + 'stocks' + sep + '{}.csv'.format(company))
    stems_dates, stems, stems_count = stem(news_dates, news, news_count)
    writeNews(stems_dates, stems, stems_count,
              path + 'stems' + sep + '{}.csv'.format(company))
    # stems_dates, stems, stems_count = readNews(path + 'stems' + sep + '{}.csv'.format(company))
    connections_dates, connections_news, connections_stocks, connections_count = connect(
        stems_dates, stems, stems_count, stocks_dates, stocks, stocks_count)
    writeConnections(connections_dates, connections_news, connections_stocks,
                     connections_count,
                     path + 'connections' + sep + '{}.csv'.format(company))
def process(self, _ptweet):
    self.timestamp = _ptweet.timestamp
    _tokens = _ptweet.tokens
    tokens = [stemmer.stem(x) for x in _tokens]
    if len(tokens) < 3:
        return None, None
    unique_words = set(tokens)
    unique_word_pairs = set()
    for i in unique_words:
        for j in unique_words - {i}:
            # To us [a, b] = [b, a], and sorting gives us a distinct representation.
            unique_word_pairs.add(tuple(sorted([i, j])))
    max_sig = 0
    max_sig_instance = None
    sig_list = list()
    for token in unique_word_pairs:
        if _SIGNI_TYPE == 's':
            min_instance = []
            scores, codes = self.sig_scorers.get(token, self.timestamp)
            for x in scores:
                min_instance.append(x.observe(int(self.timestamp), 1.0))
            count, ewma, ewmavar, sig = min(min_instance, key=lambda x: x[1])
            # count, ewma, ewmavar, sig = min(
            #     [x.observe(int(self.timestamp), 1.0) for x in self.sig_scorers.get(token, self.timestamp)],
            #     key=lambda x: x[1])
            if sig > max_sig and ewma > 0:
                max_sig = sig
                max_sig_instance = (_ptweet.datetime(), count, ewma, ewmavar, sig, token)
            if sig > _SIGNI_THRESHOLD and ewma > 0:
                sig_list.append((_ptweet.datetime(), count, ewma, ewmavar, sig, token))
    if max_sig > _SIGNI_THRESHOLD:
        # print(max_sig_instance)
        return max_sig_instance, sig_list
    return None, None
def search_stem(tokens):
    prev_doc_ids = set()
    accumulate = or_comp
    for token in tokens:
        token = token.lower()
        token = stemmer.stem(token, 0, len(token) - 1)
        if operators.get(token[0:2]):
            accumulate = operators[token[0:2]]
            # print 'operators', accumulate
            token = token[2:]
        doc_ids = inverted_index.get(token)
        # print token, '=', doc_ids
        if doc_ids:
            doc_ids = accumulate(doc_ids, prev_doc_ids)
            # print accumulate, '=', doc_ids
            prev_doc_ids = set(doc_ids)
    l = list(doc_ids)
    l.sort()
    print '\t', tokens, '-->', l
def analyse_topics(self, _probs):
    words = set()
    for term in self.active_terms:
        for word in term[1]:
            words.add(word)
    print "size of words:", len(words)
    high_prob_words = []
    for _word in words:
        word = stemmer.stem(_word)
        hash_code = np.array(fast_hashing.hash_code(word)) % _SKETCH_BUCKET_SIZE
        min_prob_list = []
        for h in range(fast_hashing.HASH_NUMBER):
            prob = _probs[h][hash_code[h]]
            min_prob_list.append(prob)
        min_prob_list.sort()
        min_prob = min_prob_list[1]  # !!!
        if min_prob >= _PROBABILITY_THRESHOLD:
            high_prob_words.append((word, min_prob, hash_code))
    high_prob_words.sort(key=lambda x: x[1], reverse=True)
    high_prob_words = high_prob_words[:_MAX_NUMBER_WORDS]
    print high_prob_words
    _kws = list()
    _kps = list()
    post_result = postprocessor.process(high_prob_words, self.active_terms)
    print post_result
    if not post_result[0]:
        return
    _event = dict()
    _id = event_output.getId()
    _event['eid'] = _id
    _event['topicID'] = _id
    _event['info.dtime'] = str(datetime.datetime.utcfromtimestamp(self.timestamp))
    # for high_prob_word in high_prob_words:
    #     _kws.append(high_prob_word[0])
    #     _kps.append(high_prob_word[1])
    word_level_result = post_result[1]
    for i in range(len(high_prob_words)):
        high_prob_word = high_prob_words[i]
        if word_level_result[i]:
            _kws.append(high_prob_word[0])
            _kps.append(high_prob_word[1])
    _event['info.keywords'] = _kws
    _event['info.probs'] = _kps
    _event['info.numUsers'] = post_result[3]
    _event['info.numGeoUsers'] = 0
    _event['info.numTweets'] = post_result[2]
    _event['info.numGeoTweets'] = 0
    event_output.put(_id, _event)
if len(sys.argv) < 2:
    print("Invalid argument count")
    sys.exit()
else:
    if len(sys.argv) >= 3:
        bgColor = sys.argv[2]
    if len(sys.argv) == 4:
        maskFile = "py/masks/" + sys.argv[3] + ".jpg"
outFile = sys.argv[1]
inp = open("py/text/" + outFile + ".txt", encoding="utf-8", mode="r")
text = inp.read()
inp.close()
text = re.sub(r'[\W_]+', ' ', text)
splits = [x for x in text.split(' ') if (not x.isspace() and x)]
stemsplits = stemmer.stem(" ".join(splits)).split(' ')
dicta = {}
dicts = defaultdict(int)
dictst = defaultdict(int)
dictfull = {}
for x in range(0, len(splits)):
    dicta[splits[x]] = stemsplits[x]
for x in range(0, len(splits)):
    dicts[splits[x]] += 1
for x in range(0, len(stemsplits)):
    dictst[stemsplits[x]] += 1
sorted_d = sorted(dicts.items(), key=operator.itemgetter(1))
brr = len(splits)
for w in range(0, len(sorted_d)):
    if dicta[sorted_d[w][0]] not in dictfull:
        dictfull[dicta[sorted_d[w][0]]] = (sorted_d[w][0],
"knew": "knew", "knick": "knick", "knif": "knif", "knife": "knife", "knight": "knight", "knightly": "knight", "knights": "knight", "knit": "knit", "knits": "knit", "knitted": "knit", "knitting": "knit", "knives": "knive", "knob": "knob", "knobs": "knob", "knock": "knock", "knocked": "knock", "knocker": "knocker", "knockers": "knocker", "knocking": "knock", "knocks": "knock", "knopp": "knopp", "knot": "knot", "knots": "knot", } for original in test_cases: stemmed_term = stem(original) expected_stem = test_cases[original] error_msg = "stemmed %s to %s, expected value %s" % (original, stemmed_term, expected_stem) assert expected_stem == stemmed_term, error_msg
import sys

from triples import ParseTriples, Triple
import keyvalue.sqliteKVStore as sqliteKVS
import stemmer as s

imagesStore = sqliteKVS.SqliteKeyValue("images.db")
labelsStore = sqliteKVS.SqliteKeyValue("labels.db")

if len(sys.argv) < 2:
    print("Es necesario indicar la o las palabras a buscar Ejemplo:")
    print("{0} palabra1".format(sys.argv[0]))

for word in sys.argv[1:]:
    w = s.stem(word)
    newword = labelsStore.getItem(w)
    print(newword)
    # if len(word) > 0:
    #     print(imagesStore.getItem(word[0][0]))

# @TODO Here we need to implement the logic that looks up the URLs
# associated with each word given on the command line.
imagesStore.close()
labelsStore.close()
dictionary[image.getSubject()] = image.getObject()
print(image.getSubject() + " -- " + image.getObject())
image = imagesDS.getNext()

for key, value in dictionary.items():
    h = 0
    imagsDyna.putItem(key, {"S": value})

for i in range(0, 5000):
    label = labelsDS.getNext()
    # if len(labelsDyna.getItem(label.getSubject())) > 0:
    stemmer = s.stem(label.getObject())
    # Note that label could have multiple values, so iterate through the list
    for word in stemmer.split(" "):
        if label.getSubject() in dictionary:
            if word in dictionaryLabels:
                dictionaryLabels[word].append({"S": label.getSubject()})
            else:
                dictionaryLabels[word] = [{"S": label.getSubject()}]
            print(word + " is associated with " + label.getObject() + " " + label.getSubject())
            # termsStore.putItem(key=word, value=label.getSubject())

for key, value in dictionaryLabels.items():
    h = 0
    labelsDyna.putItem(key, {"L": value})
def stemmed_words(doc):
    return (mystem.stem(w) for w in analyzer(doc))
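# --- usage sketch (an assumption, not part of the original snippet) ----------
# stemmed_words above matches the usual stemming-analyzer pattern for
# scikit-learn's CountVectorizer: analyzer is presumed to be the vectorizer's
# own build_analyzer(), and mystem any stemmer exposing .stem(), e.g. NLTK's
# SnowballStemmer.
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

mystem = SnowballStemmer("english")
analyzer = CountVectorizer().build_analyzer()

vectorizer = CountVectorizer(analyzer=stemmed_words)
X = vectorizer.fit_transform(["running runs ran", "the runner keeps running"])
print(vectorizer.get_feature_names_out())  # prints the stemmed vocabulary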
def correctSentence(sentence, index):
    taggedS = tagger.applyMLTag(sentence)
    word = taggedS[index][0]
    POStag = taggedS[index][1]
    stemList = stemmer.stem(word, POStag)
    # remove duplicates
    verifiedWords = []
    for s in stemList:
        # ("found stem " + str(s))
        tags = tagger.getTagsForWord(s[0])
        if len(tags) > 0:
            # print("stem added")
            verifiedWords += [s]
    # at this point, verifiedWords should contain only real words
    if len(verifiedWords) == 0:
        # print("No verified words.")
        return "No Answer"
    replacementWord = ""
    # print "Entering while loop"
    while replacementWord == "" and len(verifiedWords) > 0:
        # find the shortest word/root
        root = verifiedWords[0]
        numVerifiedLeft = len(verifiedWords)
        for w in verifiedWords:
            if len(w[0]) <= len(root[0]):
                root = w
        # print("shortest word is " + str(root))
        # possibles should contain all words that can contain the root
        possibles = tagger.getWordsWithRoot(root[0])
        if root[0][-1] == 'e':
            possibles += tagger.getWordsWithRoot(root[0][:-1])
        elif root[0][-1] == 'y':
            possibles += tagger.getWordsWithRoot(root[0][:-1] + 'i')
        for row in stemmer.csvReader("irregularPastVerbs.csv"):
            if row[0] == root[0]:
                possibles += tagger.getWordsWithRoot(row[1])
                possibles += tagger.getWordsWithRoot(row[2])
        # print("possibles for " + str(root) + " are " + str(possibles))
        # actualPossibles should contain all words that can be stemmed to the root
        possibles.sort(key=lambda x: len(x[0]), reverse=False)
        possibles = possibles[:40]
        actualPossibles = []
        for word in possibles:
            if stemmer.isRootOfWord(root[0], root[1], word[0], word[1]):
                actualPossibles += [word]
        print("actual possibles for " + str(root) + " are " + str(actualPossibles))
        prevWord = ""
        if index > 0:
            prevWord = sentence[index - 1]
        nextWord = ""
        if index < len(sentence) - 1:
            nextWord = sentence[index + 1]
        replacementWord = MLWordUsingBigrams(prevWord, nextWord, actualPossibles)
        # print("replacement word found for root " + str(root) + " is " + replacementWord)
        # verifiedWords.remove(root)
        numVerifiedLeft -= 1
        if numVerifiedLeft == 0 and replacementWord == "":
            # print("No good replacements found. Cry now.")
            return "No Answer"
    # print("We highly recommend that you replace your word with " + replacementWord)
    # print("Your sentence would then become:")
    sentence[index] = replacementWord
    newSentence = ""
    for w in sentence:
        newSentence += (w + " ")
    print newSentence
    return sentence
border = int(connections_count * 0.75)
training_dates = total_dates[:border]
training_news = total_news_sequence[:border]
training_stocks = total_stocks[:border]
training_count = border
testing_dates = total_dates[border:]
testing_news = total_news_sequence[border:]
testing_stocks = total_stocks[border:]
testing_count = total_count - border

total_X = numpy.array(total_news_sequence)
total_y = numpy.array(total_stocks)
training_X = numpy.array(training_news)
training_y = numpy.array(training_stocks)
testing_X = numpy.array(testing_news)
testing_y = numpy.array(testing_stocks)

if sys.argv[1] == '-f':
    fit(company, training_X, training_y, testing_X, testing_y)
else:
    news_dates, news, news_count = readNews(path + predict_path)
    stems_dates, stems, stems_count = stem(news_dates, news, news_count)
    news_sequences = sequence.pad_sequences(
        sequences=tokenizer.texts_to_sequences(stems))
    y = predict(news_sequences, company)
    print(y)