def test_repeated_keywords(self):
    text = self._get_text_from_test_data("testrepeatedkeywords.txt")

    kwds = keywords(text)
    self.assertTrue(len(kwds.splitlines()))

    kwds_u = keywords(utils.to_unicode(text))
    self.assertTrue(len(kwds_u.splitlines()))

    kwds_lst = keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
def test_keywords_runs(self):
    text = self._get_text_from_test_data("mihalcea_tarau.txt")

    kwds = keywords(text)
    self.assertTrue(len(kwds.splitlines()))

    kwds_u = keywords(utils.to_unicode(text))
    self.assertTrue(len(kwds_u.splitlines()))

    kwds_lst = keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
def test_keywords_ratio(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # Check that the ratio parameter is well behaved. Because length is taken on tokenized clean text,
    # we just check that ratio 20% is twice as long as ratio 10%.
    # Values of 10% and 20% were carefully selected for this test to avoid
    # numerical instabilities when several keywords have almost the same score.
    selected_docs_12 = keywords(text, ratio=0.1, split=True)
    selected_docs_21 = keywords(text, ratio=0.2, split=True)

    self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)
def test_keywords_runs(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt")) as f:
        text = f.read()

    kwds = keywords(text)
    self.assertTrue(len(kwds.splitlines()))

    kwds_u = keywords(utils.to_unicode(text))
    self.assertTrue(len(kwds_u.splitlines()))

    kwds_lst = keywords(text, split=True)
    self.assertTrue(len(kwds_lst))
def results():
    # get data
    URLS = ['https://www.binance.com/en', 'http://www.supermap.com']
    ATTRIBUTES = ['description', 'keywords', 'Description', 'Keywords']
    collected_data = []
    res = []

    data = request.form['command']
    # ..............................................
    URLS = [data]

    for url in URLS:
        entry = {'url': url}
        try:
            r = requests.get(url)
        except Exception as e:
            res = 'Could not load page {}. Reason: {}'.format(url, str(e))
            print('Could not load page {}. Reason: {}'.format(url, str(e)))
            return render_template('results.html', predictions=res)

        if r.status_code == 200:
            soup = BeautifulSoup(r.content, 'html.parser')
            meta_list = soup.find_all("meta")
            for meta in meta_list:
                if 'name' in meta.attrs.keys() and meta.attrs['name'].strip().lower() in ['description', 'keywords']:
                    name = meta.attrs['name']
                    entry[name.lower()] = meta.attrs['content']
            # if len(entry) == 3:
            collected_data.append(entry)
            # else:
            #     print('Could not find all required attributes for URL {}'.format(url))
            #     res = 'Could not find all required attributes for URL {}'.format(url)
            #     return render_template('results.html', predictions=res)
        else:
            print('Could not load page {}. Reason: {}'.format(url, r.status_code))
            res = 'Could not load page {}. Reason: {}'.format(url, r.status_code)
            return render_template('results.html', predictions=res)

    print('Collected meta attributes (TODO - push to DB):')
    for entry in collected_data:
        print(entry)
        print("Summary")

        # TextRank method
        print(keywords(str(entry)).split('\n'))
        print('\n')

        # KeyBERT method
        from keybert import KeyBERT
        model = KeyBERT('distilbert-base-nli-mean-tokens')
        print(model.extract_keywords(str(entry), keyphrase_ngram_range=(1, 2), stop_words=None))
        print('\n')
        res = model.extract_keywords(str(entry), keyphrase_ngram_range=(1, 2), stop_words=None)

    return res
def extract_keywords(self, text):
    """Extract keywords from the given text based on gensim word scoring.

    Ratio is set to 0.1 to return only the highest-scoring keywords.
    Returns a list of keywords.
    """
    return keywords(text, split=True, ratio=0.1)
def SummarizerAndkeyworder(text):
    # convert text into string format [explicit]
    text = str(text)

    print('\n\n#Summary:\n\n')
    summary = summarize(text, ratio=0.1)
    print(summary)

    print('\n\n#Keywords\n\n')
    print(keywords(text, ratio=0.1))
def textrank(self):
    string = " "
    stop_words = set(stopwords.words('english'))
    for sent in self.sents:
        for token in sent:
            if token not in stop_words:
                string += " " + token
    return keywords(string).split('\n')
def keyWords_Labels_Matching(Country, gallery_id):
    DocList, Data = Load_GalLery_Textual_Data(Country, gallery_id)
    S1, Data1 = Load_GoogleVision_Labels(Country, gallery_id)

    data_lemmatized = [w for doc in PrepareData(DocList) for w in doc]
    print(data_lemmatized)
    fullStr = ' '.join(data_lemmatized)

    # labels = [Preprocessing(x['label']) for x in S1[0]]
    # labels.append(Preprocessing(S1[1]))
    labels = [w for label in PrepareData(S1) for w in label]

    setA = list(set(labels))
    setB = keywords(fullStr).split('\n')
    setB = [w for docs in PrepareData(setB) for w in docs]

    # count fuzzy matches between vision labels (setA) and comment keywords (setB)
    overlap = 0
    for l in setA:
        for w in setB:
            if fuzz.ratio(l, w) >= 75:
                overlap += 1

    # build the universe of distinct terms, collapsing fuzzy duplicates
    universe = []
    uni = list(set(setA) | set(setB))
    for i in range(len(uni)):
        if uni[i] not in universe:
            universe.append(uni[i])
            for j in range(i + 1, len(uni)):
                if fuzz.ratio(uni[i], uni[j]) >= 75 and uni[j] not in universe:
                    universe.append(uni[j])
    universe = len(universe)

    labels = round(float(overlap) / len(setA) * 100., 2)
    comments = round(float(overlap) / len(setB) * 100., 2)
    overall = round(float(overlap) / float(universe) * 100., 2)

    # print('overlap = ', overlap)
    # print('universe = ', universe)
    # print('\nLabels = ', len(setA))
    # print('Comments = ', len(setB))
    # print('overlap(Labels,Comments)/Labels = ', labels)
    # print('overlap(Labels,Comments)/Comments = ', comments)
    print('overlap(Labels,Comments)/Universe(Labels,Comments) = ', overall)

    return labels, comments, overall, setA, setB
def crwallNews():
    req = requests.get('https://www.reuters.com/news/world')
    req.encoding = 'utf-8'

    title = []
    title_kor = []
    keyword = []
    keyword_kor = []
    summary = []
    summary_kor = []
    upload_day = []
    href = []

    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.select('.story-content a ')
    r = Rake()

    for i in posts:
        if 'href' in i.attrs:
            plain_title = i.get_text().replace("\t", "").replace("\n", "")
            plain_href = 'https://www.reuters.com/news/world' + str(i.attrs['href'])

            # crawl the article body
            bsObject = BeautifulSoup(requests.get(plain_href).text, "html.parser")
            body = bsObject.find_all('p', 'ArticleBody-para-TD_9x')
            bodyText = []  # article body
            for p in body:
                bodyText.append(p)
            bodyText = str(bodyText)
            bodyText = re.sub('<.+?>', '', bodyText, 0, re.I | re.S)  # strip HTML tags

            # keyword extraction
            r.extract_keywords_from_text(bodyText)  # extract keywords from the body
            words = r.get_ranked_phrases()
            keyword.append(''.join(keywords(' '.join(words[0:3])).split('\n')))  # keep 3 keywords in total
            # summary.append(...)  # TODO: not sure how to build the summary here

            href.append(plain_href)
            title.append(plain_title)
            title_kor.append(''.join(eng2kr(i.get_text())))
            upload_day.append(datetime.datetime.utcnow())

    latest = pd.DataFrame({
        "href": href,
        "title": title,
        "title_kor": title_kor,
        "upload_day": upload_day
    })
    latest = latest.fillna(0)
    latest = latest[latest['title'].isin(findMongo()) == False]
    print(latest)
    latest.reset_index(inplace=True)

    data_dict = latest.to_dict("records")
    print(data_dict)
    return data_dict
def get_keywords(content):
    try:
        keys = keywords(content, words=10, split='\n',
                        pos_filter=('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN'),
                        lemmatize=True)
    except ZeroDivisionError:
        keys = []
    except IndexError:
        keys = keywords(content, ratio=1, split='\n',
                        pos_filter=('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN'),
                        lemmatize=True)
    return keys
def return_keywords(texts):
    xkeywords = []
    try:
        values = keywords(text=preprocess(texts), split='\n', scores=True)
        for x in values[:10]:
            xkeywords.append(x[0])
        return xkeywords
    except Exception:
        return "no content"
def post():
    posted_data = request.get_json()
    text = posted_data['text']
    text = keywords(text)
    return jsonify({'Keywords': text})
def txtsummarise():
    txt = text.get('1.0', END)
    text1.delete('1.0', END)
    text1.insert(END, summarize(txt, ratio=0.3))
    text2.delete('1.0', END)
    text2.insert(END, keywords(txt, ratio=0.3))
def many_keywords_w2v(text):
    '''Iterates over the words in one question: extracts the keywords,
    their scores, and their word vectors.'''
    keyword_list = keywords(text, ratio=0.2, words=None, split=False, scores=True,
                            pos_filter=None, lemmatize=True, deacc=True)
    keyword_list_w2v = []
    for keyword, score in keyword_list:
        word_vector = one_keyword_w2v(keyword)
        if type(word_vector) != int:
            keyword_list_w2v.append([keyword, score, word_vector])
    return keyword_list_w2v
def extract_keyphrase_list(text_string, ratio=0.5, min_phrase_length=1):
    keyword_list = keywords(text_string, ratio=ratio, split=True, scores=True, lemmatize=True)
    scale = 25.0
    filtered_keyword_list = []
    for keyword_tuple in keyword_list:
        keyword = keyword_tuple[0].lower().strip()
        if len(keyword.split(' ')) >= min_phrase_length:
            filtered_keyword_list.append(keyword)
    return filtered_keyword_list
def getKeywords(text):
    nltk.download('wordnet')
    tokenized = nltk.tokenize.word_tokenize(text)
    stemmer = SnowballStemmer("english", ignore_stopwords=False)
    rawkws = summary.keywords(text)
    keyws = rawkws.split()
    return keyws
def keywords_reshape(searchText, data_dict):
    """Return a list of overarching topics."""
    running_kwds = []
    for k, v in data_dict.items():
        running_kwds = running_kwds + keywords(v, split=True)
    return list(set(running_kwds))
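# A minimal usage sketch for keywords_reshape above, assuming gensim < 4.0
# (gensim.summarization was removed in 4.0). The two documents are made up for
# illustration; note that the searchText argument is not used inside the function.
sample_docs = {
    "doc1": ("Neural networks learn representations from data. Deep neural networks "
             "stack many layers, and each layer learns a more abstract representation "
             "of the training data than the layer below it."),
    "doc2": ("Gradient descent updates the network weights. Stochastic gradient descent "
             "estimates the gradient of the loss from small batches of data, which makes "
             "training on large datasets practical."),
}
print(keywords_reshape("unused query", sample_docs))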
def get_keywords_textrank(text):
    text_keys = keywords(text, ratio=1, lemmatize=True, scores=True, split=True, pos_filter=())
    text_keys = [tup[0] for tup in text_keys[:10]]
    return text_keys
def gen_keyword_list(self, para_list, ratio=1):
    keyword_list = []
    keyword_ratio = ratio
    print('generating default keyword...')
    for para in tqdm(para_list):
        line = re.sub(r"<unk>\s?", '', para)
        keyword_list.append(keywords(line, ratio=keyword_ratio, split=True))
    return keyword_list
def test_keywords_ratio(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # Check that the ratio parameter is well behaved. Because length is taken on tokenized clean text,
    # we just check that ratio 20% is twice as long as ratio 10%.
    # Values of 10% and 20% were carefully selected for this test to avoid
    # numerical instabilities when several keywords have almost the same score.
    selected_docs_12 = keywords(text, ratio=0.1, split=True)
    selected_docs_21 = keywords(text, ratio=0.2, split=True)

    self.assertAlmostEqual(float(len(selected_docs_21)) / len(selected_docs_12), float(21) / 12, places=1)
def _prepare_keywords(self, filepath=None):
    if filepath is None:
        filepath = MSG_FILE
    text = ''
    for line in get_lines(filepath):
        if '?' in line:
            text += line
    self.kwds = set(keywords(text).split())
def get_speech():
    global out

    driver.get("https://www.moneycontrol.com/annual-report/" + company_name + "/directors-report/" + cd + "#" + cd)
    # directors' report
    director_speech = driver.find_element_by_xpath('//div[@class="report_data"]').text
    d = re.match('.*\\n', director_speech).group()
    # ds = director_speech.rstrip("\n")
    # ds = re.sub('\n', ' ', director_speech)
    ds = re.sub(d, ' ', director_speech)

    driver.get("https://www.moneycontrol.com/annual-report/" + company_name + "/chairmans-speech/" + cd + "#" + cd)
    chairman_speech = driver.find_element_by_xpath('//div[@class="report_data"]').text
    c = re.match('.*\\n', chairman_speech).group()
    # cs = re.sub('\n', ' ', chairman_speech)
    cs = re.sub(c, " ", chairman_speech)

    # keywords from the whole directors' report and chairman's speech
    ds_keyword_list = keywords(ds, words=20, split=True, lemmatize=True)
    cs_keyword_list = keywords(cs, words=20, split=True, lemmatize=True)
    # print(keyword_list)

    ds_keyword_tags = dict(nltk.pos_tag(ds_keyword_list))
    cs_keyword_tags = dict(nltk.pos_tag(cs_keyword_list))
    ds_keywords_final = [word for word in ds_keyword_tags.keys() if ds_keyword_tags[word] == 'NN']
    cs_keywords_final = [word for word in cs_keyword_tags.keys() if cs_keyword_tags[word] == 'NN']
    # also need to remove the company name if it appears in the list
    # print(keywords_final[:5])

    # summarization
    ds_summ = summarize(ds, word_count=100)
    cs_summ = summarize(cs, word_count=100)

    out = json.dumps([
        {'summary': cs_summ, 'keywords': cs_keywords_final[:5], 'fullCont': cs},
        {'summary': ds_summ, 'keywords': ds_keywords_final[:5], 'fullCont': ds},
    ])
    return out
def crwallNews():
    req = requests.get('https://www.reuters.com/news/world')
    req.encoding = 'utf-8'

    title = []
    title_kor = []
    keyword = []
    keyword_kor = []
    summary = []
    summary_kor = []
    upload_day = []
    href = []

    r = Rake()
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.select('.story-content a ')

    for i in posts:
        if 'href' in i.attrs:
            plain_title = i.get_text().replace("\t", "").replace("\n", "")
            plain_href = 'https://www.reuters.com/news/world' + str(i.attrs['href'])
            plain_text = crwallbody(plain_href)

            # summary
            r.extract_keywords_from_text(plain_text)
            summary_temp = ','.join(r.get_ranked_phrases()[:3])
            summary.append(summary_temp)
            summary_kor.append(''.join(eng2kr(summary_temp)))

            # keyword
            keyword_temp = ','.join(keywords(plain_text).split('\n')[:3])
            keyword.append(keyword_temp.replace('reuters', '').replace('news', '').replace('provider', ''))
            keyword_kor.append(''.join(eng2kr(keyword_temp)))

            href.append(plain_href)
            title.append(plain_title)
            title_kor.append(''.join(eng2kr(i.get_text())))
            upload_day.append(datetime.datetime.utcnow())

    latest = pd.DataFrame({
        'href': href,
        'title': title,
        'title_kor': title_kor,
        'summary': summary,
        'keyword': keyword,
        'keyword_kor': keyword_kor,
        'summary_kor': summary_kor,
        'upload_day': upload_day,
        'plain_text': plain_text
    })
    latest = latest.fillna(0)
    latest = latest[latest['title'].isin(findMongo()) == False]
    print(latest)
    latest.reset_index(inplace=True)

    data_dict = latest.to_dict("records")
    print(data_dict)
    return data_dict
def test_text_keywords_words(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # request 15 keywords; this text yields 16 entries
    generated_keywords = keywords(text, words=15, split=True)

    self.assertEqual(len(generated_keywords), 16)
def test_text_summarization_raises_exception_on_short_input_text(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
        text = f.read()

    # Keeps the first 8 sentences to make the text shorter.
    text = "\n".join(text.split('\n')[:8])

    self.assertTrue(keywords(text) is not None)
def get_keywords(self, text, ratio):
    # Use gensim's keyword-extraction mechanism to find the most important
    # non-stop-word terms in the text.
    cleaned = self.cleanText(text)
    text_keywords = keywords(cleaned, ratio=ratio).split("\n")
    valid_keywords = []
    for keyword in text_keywords:
        if keyword not in self.stop_words:
            valid_keywords.append(keyword)
    return valid_keywords
def analyze(message):
    keyword = ''
    summary = ''
    alert = ''
    Polarity = ''
    Subjectivity = ''

    if len(message) < 2:
        alert = 'enter text longer than 2 characters'
        print(alert)
    elif len(message) < 100:
        overview = TextBlob(message)
        Polarity = str(round(overview.sentiment.polarity, 2))
        Subjectivity = str(round(overview.sentiment.subjectivity, 2))
    elif len(message) < 250:
        Keywords = str(keywords(message))
        for word in Keywords.split('\n'):
            keyword = keyword + word + ', '
        overview = TextBlob(message)
        Polarity = str(round(overview.sentiment.polarity, 2))
        Subjectivity = str(round(overview.sentiment.subjectivity, 2))
    else:
        Keywords = str(keywords(message))
        for word in Keywords.split('\n'):
            keyword = keyword + word + ', '
        summary = summarize(message)
        overview = TextBlob(message)
        Polarity = str(round(overview.sentiment.polarity, 2))
        Subjectivity = str(round(overview.sentiment.subjectivity, 2))

    lst = []
    lst.extend([Polarity, Subjectivity, summary, keyword])
    return lst
def keywords_from_msgs(messages: List[Message], method='gensim') -> List[str]:
    text = whole_text(messages)
    if method == 'RAKE':
        return topic_extraction_rake(text)
    elif method == 'gensim':
        # split=True so the return value matches the List[str] annotation
        return keywords(text, split=True)
    elif method == 'mglda':
        raise NotImplementedError("Not Yet")
def gensim_textrank_keywords(x_train, x_test=None, list_of_cols=[], new_col_name="_extracted_keywords", **algo_kwargs):
    """
    Uses Gensim Text Rank summarizer to extract keywords.

    Note this uses a variant of Text Rank.

    Parameters
    ----------
    x_train : DataFrame
        Dataset
    x_test : DataFrame
        Testing dataset, by default None
    list_of_cols : list, optional
        Column name(s) of text data that you want to extract keywords from
    new_col_name : str, optional
        New column name to be created when applying this technique, by default `_extracted_keywords`

    Returns
    -------
    Dataframe, *Dataframe
        Transformed dataframe with the new column.
        Returns 2 Dataframes if x_test is provided.
    """
    for col in list_of_cols:
        x_train.loc[:, col + new_col_name] = [
            keywords(x, **algo_kwargs) for x in x_train[col]
        ]

        if x_test is not None:
            x_test.loc[:, col + new_col_name] = [
                keywords(x, **algo_kwargs) for x in x_test[col]
            ]

    return x_train, x_test
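# A minimal usage sketch for gensim_textrank_keywords above, assuming pandas and
# gensim < 4.0. The "review" column and its contents are made up, and the ratio
# keyword is forwarded to gensim's keywords() through **algo_kwargs.
import pandas as pd

train_df = pd.DataFrame({
    "review": [
        ("The battery lasts all day and the battery charges quickly, so for battery "
         "life alone this laptop beats every other laptop I have owned."),
        ("The screen is dim, the screen flickers in bright rooms, and the screen "
         "coating scratches easily, which makes the screen quality disappointing."),
    ]
})
train_df, _ = gensim_textrank_keywords(train_df, list_of_cols=["review"], ratio=0.4)
print(train_df["review_extracted_keywords"].tolist())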
def reumir(self, text):
    res = summarize(text, ratio=0.1)
    print(res)
    res2 = summarize(text, word_count=100)
    print(res2)
    print(keywords(text, ratio=0.1))
def getKeywords(text):
    """Get keywords of text with the count of the number of times they appear."""
    # TODO: Add plural stripping (convert plural words to singular to help reduce
    # the number of dimensions)
    kwordsCount = {}
    kwords = keywords(text).strip().split('\n')
    for word in kwords:
        kwordsCount[word] = text.count(word)
    return kwordsCount
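# A minimal usage sketch for the getKeywords just above (the count-returning
# variant), assuming gensim < 4.0. The sample paragraph is made up; the returned
# dict maps each extracted keyword to its raw substring count in the text.
sample_text = (
    "The solar panel converts sunlight into electricity. A larger solar panel "
    "produces more electricity, and the panel efficiency determines how much of "
    "the available sunlight is actually converted into usable electricity."
)
print(getKeywords(sample_text))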
def gensim_keywords():
    text = (
        "Recently, I registered with the GP across the road from my flat. I’ve lived there for a year. I had put off registering beforehand due to a severe allergy to bureaucracy. Last year, I wanted to go see a specialist. I snored at my private healthcare provider’s response that I first needed to have a consultation with my doctor… Safe to say it never happened. ‘Access’, in healthcare, tends to mean availability of care, and often comes down to the affordability of the care, and the size of the audience that have access to it. My anecdotes are obviously examples of minor friction, rather than any ‘real’ issues with access to healthcare, but in the ‘on demand’ world we live in, where easy sign-up and zero-friction on-boarding is king, access also means getting the right medical ‘product’ at the time you need it. This is the advancement in healthcare that I’m most excited about. Products are now being delivered directly to the patient – which is preferable to leaving them at the mercy of the doctor’s calendar. "
        "The new meaning of ‘over the counter’ In grocery stores fifty years ago, there used to be a clear divide between the consumers and the goods. Groceries could be accessed only through the shop clerk, who was the gatekeeper. It was useful to have someone from whom to advice, but over time, it became clear that it was a far better to be allowed to make their own decisions. The same switch is now happening in healthcare. Due to the improved access to information in the ‘WebMD’ era, people often have an idea of what is wrong with them when they are ill. Also, they are generally health-conscious, and therefore keen to ensure they remain healthy today’ so they don’t become a patient tomorrow. Due to this change in behavior and improved access to information, there is now room in the market for consumer-grade healthcare products and digital tools to ensure these products are available to large audiences, in new ways. Those who successfully build consumer products and brands in healthcare will win big in the next few years. Medical data is ripe for change. Our data is currently hidden in disparate patient records, but is becoming our own again thanks to platforms like PatientsKnowBest (full disclosure, Balderton is invested). Digital tools are also changing how we manage diseases. Behavior change programs are being turned into products, and scaled across a previously impossible large number of patients. This is particularly powerful when tackling healthcare problems that affect a nation of people, and emanate from poor lifestyle."
    )

    print('Keywords: ')
    pos_filter = ['JJ', 'CC', 'CD', 'DT', 'EX', 'RB', 'WRB', 'WP$', 'WP', 'VB']
    print(keywords(text, ratio=0.1, pos_filter=pos_filter, split=True))
def get_summary(text):
    text = str(text)
    all_text = ''.join(text).replace(']', '').replace('[', '').replace("'", '')
    all_text = re.sub(r'\[[0-9]*\]', ' ', all_text)
    key_words = keywords(all_text, lemmatize=True, words=20).split('\n')
    summary = summarize(all_text, ratio=0.01)
    first_summary_sentence = summary.split('.')[0]
    return key_words, summary, first_summary_sentence
def _get_tags_from_cluster_summaries(self, cluster_summaries):
    summaries = []
    for cluster_id, cluster_summary in cluster_summaries:
        if cluster_summary == self.ERROR_NOT_ENOUGH_POSTS_FOR_TAGS:
            continue
        summaries.append(cluster_summary)
    summaries = [
        ' '.join([word for word in text.lower().split() if word not in self.corpus.stop_words])
        for text in summaries
    ]
    return list(enumerate(keywords('. '.join(summaries), split=True, words=15)))
def test_text_keywords_pos(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # calculate keywords using only certain parts of speech
    generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True)

    # To be compared to the reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
        kw = f.read().strip().split("\n")

    self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw})
def test_text_keywords(self):
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')

    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
        text = f.read()

    # calculate keywords
    generated_keywords = keywords(text, split=True)

    # To be compared to the reference.
    with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
        kw = f.read().strip().split("\n")

    self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw})
def test_text_keywords_with_small_graph(self):
    # regression test: we get a 2x2 graph on this text
    text = 'IT: Utilities A look at five utilities to make your PCs more, efficient, effective, and efficacious'
    kwds = keywords(text, words=1, split=True)
    self.assertTrue(len(kwds))
try:
    doc_id, text_body = doc_id_text_generator.next()
except StopIteration:
    print('not enough docs found, breaking')
    break

concat_txt = ' '.join([concat_txt, text_body[:args.single_doc_len]])
breakout += 1

print('used %i concatenated docs for this topic' % breakout)
print('actual character length of concatenated docs: %i' % len(concat_txt))

# make sure you have something
if len(concat_txt) == 0:
    print('got nothing for this topic')
    continue

# TODO: make args
generate_keywords = True
generate_sentences = True

if generate_keywords:
    print('\ngenerating keywords\n------------------------------\n')
    summary = keywords(concat_txt, ratio=args.summary_ratio, split=True, lemmatize=True)
    print(', '.join(summary))

if generate_sentences:
    print('\ngenerating sentences\n------------------------------\n')
    summary = summarize(concat_txt, split=True, ratio=args.summary_ratio)
    for sentence in summary:
        print(' * ' + sentence)  # it's a sentence or keyword depending on the --sentence flag
def kw(asp, pol):
    with open(path + asp + "/" + pol, 'r') as myfile:
        data = myfile.read()  # .replace('\n', '.')
    kw = keywords(data)
    print("\n\n" + i + "\t\t:" + pol + "\n_______________________\n" + kw)
def test_text_keywords_without_graph_edges(self):
    # regression test: we get a graph with no edges on this text
    text = 'Sitio construcción. Estaremos línea.'
    kwds = keywords(text, deacc=False, scores=True)
    self.assertFalse(len(kwds))
from gensim.summarization import summarize
from gensim.summarization import keywords
import pandas as pd

if __name__ == "__main__":
    df = pd.read_csv('newtestament.txt', delimiter="|", skiprows=0,
                     names=['Book', 'Chapter', 'Verse', 'Original Text'])
    text = df[df['Book'] == 'Mat']['Original Text'].values

    # concatenate all verses of Matthew into one string
    alltext = ''
    for verse in text:
        alltext += verse

    print(summarize(alltext, ratio=0.005))
    print(keywords(alltext, ratio=0.01))