def get_concordance_polarity(main_df, uni_ents, lex_path):
    """
    Prepare the list of entities for finding concordances and their polarity.
    :param main_df: the article DataFrame
    :param uni_ents: entity ids from the articles
    :param lex_path: location of the lexicon files
    :return: DataFrame with two columns, ent_id and conc_pol
    """
    text = nltk.Text(main_df.lemma.tolist())
    c = nltk.ConcordanceIndex(text, key=lambda s: s.lower())
    entities = []
    for ent in uni_ents:
        # extract the words that correspond to the entity id
        words = list(set(main_df.loc[main_df['ent_id'] == ent, 'lemma']))
        temp = []
        for w in words:
            # only keep words tagged PROPN, ADJ or NOUN, without duplicates
            if main_df.loc[main_df['lemma'] == w, 'POS_tag'].head(1).item() in ('PROPN', 'ADJ', 'NOUN'):
                if w.lower() not in temp:
                    temp.append(w.lower())
        entities.append(temp)
    dict_entities = dict(zip(uni_ents, entities))
    polarity_conc = pd.DataFrame(uni_ents, columns=['ent_id'])
    polarities = [
        polarity_concordances(ent_words, c, text, lex_path)
        for ent_words in dict_entities.values()
    ]
    polarity_conc['conc_pol'] = pd.Series(polarities)
    return polarity_conc
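# A hypothetical driver for get_concordance_polarity, kept commented out because it
# depends on a project-specific polarity_concordances() helper and lexicon files.
# The DataFrame layout ('lemma', 'ent_id', 'POS_tag' columns) follows the function above.
# import pandas as pd
# main_df = pd.DataFrame({
#     'lemma':   ['Apple', 'release', 'great', 'phone'],
#     'ent_id':  ['E1', None, None, None],
#     'POS_tag': ['PROPN', 'VERB', 'ADJ', 'NOUN'],
# })
# result = get_concordance_polarity(main_df, ['E1'], 'lexicons/')
# print(result)  # -> DataFrame with columns ent_id, conc_pol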
def getContext(self, phrase):
    phrase = phrase.lower()
    first_word = phrase.split(" ")[0]
    context = nltk.ConcordanceIndex(self.tokens)
    excerpt_padding = 6
    excerpts = []
    for i in context.offsets(first_word):
        start = max(0, i - excerpt_padding)
        end = min(len(self.tokens), i + excerpt_padding)
        excerpt = " ".join(self.tokens[start:end])
        if phrase in excerpt:
            i_phrase = excerpt.index(phrase)
            # clamp the left edge so a small index does not wrap around to the end
            left = max(0, i_phrase - 30)
            excerpt = excerpt[left:i_phrase + 30 + len(phrase)]
            if len(excerpt) < 10:
                continue
            excerpts.append(excerpt)
    return excerpts
def n_concordance_tokenised(self, text, phrase, left_margin=5, right_margin=5):
    phraseList = phrase.split(' ')
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    # Find the offsets for each token in the phrase
    offsets = [c.offsets(x) for x in phraseList]
    # For each token in the phrase, rebase its offsets to the start of the phrase
    offsets_norm = [[x - i for x in offsets[i]] for i in range(len(phraseList))]
    # An offset present in every rebased list marks a full phrase match
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])
    # max() clamps the window start to the beginning of the text
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + len(phraseList) + right_margin]
        for offset in intersects
    ]
    return [' '.join(con_sub) for con_sub in concordance_txt]
def n_concordance_tokenised(text, phrase, left_margin=5, right_margin=5):
    # concordance replication via https://simplypython.wordpress.com/2014/03/14/saving-output-of-nltk-text-concordance/
    phraseList = phrase.split(' ')
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    # Find the offsets for each token in the phrase
    offsets = [c.offsets(x) for x in phraseList]
    # For each token in the phrase, rebase its offsets to the start of the phrase
    offsets_norm = [[x - i for x in offsets[i]] for i in range(len(phraseList))]
    # We have found a phrase occurrence wherever the rebased offset lists intersect;
    # intersection takes an arbitrary number of arguments
    # (http://stackoverflow.com/a/3852792/454773)
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])
    # max() clamps the window start to the beginning of the text; subscripting
    # map() directly, as the original did, fails under Python 3
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + len(phraseList) + right_margin]
        for offset in intersects
    ]
    return [' '.join(con_sub) for con_sub in concordance_txt]
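# A minimal usage sketch for n_concordance_tokenised; the sample sentence is
# invented for illustration.
import nltk
# nltk.download('punkt')  # needed once for word_tokenize
tokens = nltk.word_tokenize(
    "the quick brown fox jumps over the lazy dog while the quick brown cat watches")
text = nltk.Text(tokens)
for line in n_concordance_tokenised(text, "quick brown", left_margin=2, right_margin=2):
    print(line)
# prints each "quick brown" hit with two tokens of context on either side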
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10):
    """
    Get all the phrases that contain the target word in a text/passage tar_passage.
    Workaround to save the output given by the nltk concordance function.

    str target_word, str tar_passage int left_margin int right_margin --> list of str
    left_margin and right_margin allocate the number of words/punctuation before and
    after the target word; the left margin is clamped to the beginning of the text.
    """
    ## Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)
    print("Tokens------------->", tokens)
    ## Create the text of tokens
    text = nltk.Text(tokens)
    print("Text----", text)
    ## Collect all the index or offset positions of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    ## Collect the range of words around the target word using text.tokens[start:end];
    ## max() clamps the start to zero (the original subtracted a hardcoded 5 and
    ## subscripted map() directly, which fails under Python 3)
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    ## join the words for each of the target phrases and return them
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
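# A quick, hypothetical call of the helper above; the passage is invented for the example.
passage = ("The crash occurred at dawn near the bridge. "
           "Witnesses described the crash in considerable detail.")
for phrase in get_all_phases_containing_tar_wrd("crash", passage, left_margin=4, right_margin=4):
    print(phrase)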
def get_concordance(nltk_text, word, left_margin=10, right_margin=10):
    index = nltk.ConcordanceIndex(nltk_text.tokens, key=lambda s: s.lower())
    # max() clamps the window start to the beginning of the text; the original
    # subtracted a hardcoded 5 instead of left_margin
    concordance_txt = [
        nltk_text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in index.offsets(word)
    ]
    output = [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
    # make sure concordances aren't broken up by ',' in the csv
    outputFormatted = str(output).replace(",", ";")
    return outputFormatted
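# A minimal usage sketch; replacing ',' with ';' keeps the whole concordance list in
# one CSV cell when rows are written naively (csv.writer would also handle quoting).
import nltk
demo = nltk.Text(nltk.word_tokenize("He spoke, paused, and spoke again."))
print(get_concordance(demo, "spoke", left_margin=3, right_margin=3))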
def context(target_word, tar_passage, left_margin=10, right_margin=10):
    tokens = tokenize.word_tokenize(tar_passage)  # requires: from nltk import tokenize
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    # clamp the window start so a hit near the beginning doesn't wrap around
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
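# A usage sketch with an invented sentence; prints the window around each hit:
print(context("fox", "The quick brown fox jumps over the lazy dog.",
              left_margin=3, right_margin=3))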
def preprocess(self, text=None, stem=False, fix_pdf=True):
    if text is None:
        text = self.text

    def fix_pdf2txt(texto):
        import re
        # rejoin lines broken mid-sentence and terminate lines that lack a period
        texto = re.sub(r'\n([^A-Z])', r' \1', texto)
        texto = re.sub(r'([^\.])\n', r'\1.\n', texto)
        return texto

    def tokenizer_fr(text):
        # Courtesy of http://www.fabienpoulard.info/post/2008/03/05/Tokenisation-en-mots-avec-NLTK
        # tok_fr is a French-aware tokenizer defined elsewhere in this module
        return tok_fr.tokenize(text)

    # Fix newline problems from the pdf-to-txt step
    if fix_pdf:
        text = fix_pdf2txt(text)
    text = text.lower()
    # Tokenization
    self._original_tokens = tokenizer_fr(text)
    self._tokens = self._original_tokens
    # self._tokens = [t for t in self._tokens if len(t) > 1]
    if stem:
        from nltk.stem.snowball import FrenchStemmer
        fr_stemmer = FrenchStemmer()
        self._tokens = [fr_stemmer.stem(t) for t in self._tokens]
    self._concordance_index = nltk.ConcordanceIndex(self._tokens, key=lambda s: s)
def get_concordance(word, textlist):
    """ Print the concordance of a word for each text in a list of texts """
    for text in textlist:
        tokens = nltk.word_tokenize(text)
        ci = nltk.ConcordanceIndex(tokens)
        if ci.offsets(word):
            ci.print_concordance(word)
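# Example call with an assumed list of documents (invented for illustration):
docs = ["The market rallied sharply before the market closed.",
        "No relevant mention in this one."]
get_concordance("market", docs)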
def kwic(target_word, passage, left_margin=5, right_margin=5):
    tokens = nltk.word_tokenize(passage)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    # max() clamps the window start; subscripting map() directly fails under Python 3
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
def get_concordance(word, textlist):
    """ Print the concordance of a word for each text in a list of texts """
    for text in textlist:
        ph, tokens = get_tokens(text)
        phrases = get_phrases(ph)
        ci = nltk.ConcordanceIndex(phrases)
        if ci.offsets(word):
            ci.print_concordance(word)
def __init__(self, text):
    self.corpus = text.lower()
    self.pos_tags = pos_tag(text, True)
    self.word_count = len(self.pos_tags)
    self.c_values = []   # form: [(c-value, ngram)]
    self.nc_values = []  # form: [(ngram, nc-value)]
    self.candidate_cache = []
    # maps a ("token", "pos-tag") pair to
    # [freq. as context word, no. of ngrams it appears with]:
    self.context_words = defaultdict(lambda: [0, 0])
    self.conc_index = nltk.ConcordanceIndex(self.pos_tags)
    self.weights = defaultdict(int)
def get_all_phases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10):
    tokens = nltk.word_tokenize(tar_passage)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    # max() clamps the window start; the original subtracted a hardcoded 5
    # instead of left_margin
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
def findConcordanceText(target_word, raw, left_margin=10, right_margin=10):
    raw = re.sub(r'\W+', ' ', raw)
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    finalText = ""
    for offset in c.offsets(target_word):
        # use the margin parameters rather than a hardcoded window of 10
        l = max(0, offset - left_margin)
        r = offset + right_margin
        line = ' '.join(tokens[l:r])
        finalText = finalText + ' ' + line
    return finalText
def counterSearch(rObject):
    rType = 0
    cType = 0
    for o in rObject['nodes']:
        nid = o['nodeID']
        n = int(nid)
        ntype = o["type"]
        text = nltk.ConcordanceIndex(nltk.word_tokenize(o['text']))
        for w in RA:
            ra = w.lower()
            if text.offsets(ra):
                print("\n")
                text.print_concordance(ra, 0, 0)
                fromID.append(n)
                rType = 1
                print('\nMatch on nodeID:', n, "type:", ntype, "with word:", ra)
                for e in rObject['edges']:  # check the node connections from the finished product
                    eid = e['edgeID']
                    etid = e['toID']
                    efid = e['fromID']
                    # parenthesised: the original `a or b and c` bound the `and` too tightly
                    if (etid == n or efid == n) and ntype == "RA":
                        print("\n !! RA confirmed on edge:", eid, efid, etid, ntype)
                        es = int(eid)
        for x in CA:
            ca = x.lower()
            if text.offsets(ca):
                text.print_concordance(ca)
                print("CA")
                print(ca)
                for e in rObject['edges']:  # check the node connections from the finished product
                    eid = e['edgeID']
                    etid = e['toID']
                    efid = e['fromID']
                    if (nid == etid or nid == efid) and ntype == "CA":
                        print("\nCA confirmed on edge:", eid)
                        ns = int(nid)
                        cType = 1  # record the CA match so the block below can run
    if rType == 1:
        nodeType = "RA"
        topNode = nodeCount(nid, ns)
        topEdge = edgeCount(es, eid)
        targetNode = createNode(topNode, nodeType)
        for x in fromID:
            topEdge = createEdge(topEdge, x, targetNode)
    if cType == 1:
        nodeType = "CA"
        topNode = nodeCount(n, e)
        createNode(topNode, nodeType)
    return rObject
def word_phases(target_word, query_text, left_margin=10, right_margin=10):
    """
    Get all the phrases that contain the target word in a text/passage.

    str target_word, nltk.Text query_text, int left_margin, int right_margin --> list of str
    left_margin and right_margin allocate the number of words/punctuation before and
    after the target word; the left margin is clamped to the beginning of the text.
    """
    ## Collect all the index or offset positions of the target word
    c = nltk.ConcordanceIndex(query_text.tokens, key=lambda s: s.lower())
    ## Collect the range of words around the target word using query_text.tokens[start:end];
    ## max() clamps the start to zero (the original subtracted a hardcoded 5)
    concordance_txt = [
        query_text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    ## join the words for each of the target phrases and return them
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
def get_all_phrases_containing_tar_wrd(target_word, tar_passage, left_margin=10, right_margin=10):
    ## Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)
    ## Create the text of tokens
    text = nltk.Text(tokens)
    ## Collect all the index or offset positions of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    ## Collect the range of words around the target word using text.tokens[start:end];
    ## max() clamps the start to zero (the original subtracted a hardcoded 5 and
    ## subscripted map() directly, which fails under Python 3)
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]
    ## join the words for each of the target phrases and return them
    return [''.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
def get_all_phrases(target_word, tar_passage, left_margin=10, right_margin=10):
    """
    Get all the phrases that contain the target word in a text tar_passage.
    Workaround to save the output given by the nltk concordance function.
    left_margin and right_margin allocate the number of words/punctuation before
    and after the target word.

    :param target_word: str
    :param tar_passage: str
    :param left_margin: int
    :param right_margin: int
    :return: list
    """
    # Create list of tokens using nltk function
    tokens = nltk.word_tokenize(tar_passage)
    # Create the text of tokens
    text = nltk.Text(tokens)
    # Collect all the index or offset positions of the target word
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    # Collect the range of words around the target word using text.tokens[start:end].
    # The window start is clamped to zero; when the left margin is truncated, the
    # lost tokens are added to the right margin so every phrase has the same length.
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):
                    offset + right_margin if (offset - left_margin) > 0
                    else offset + right_margin + abs(offset - left_margin)]
        for offset in c.offsets(target_word)
    ]
    # join the words for each of the target phrases and return them
    return [' '.join(con_sub) for con_sub in concordance_txt]
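# A small sketch (sentence invented) showing the fixed-width behaviour: a hit in the
# first few tokens still yields left_margin + right_margin tokens of context.
sample = "Crash reports arrived all morning and the final crash report came at noon."
for phrase in get_all_phrases("crash", sample, left_margin=5, right_margin=5):
    print(phrase)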
def n_concordance_tokenised(text, phrase, left_margin=1, right_margin=1):
    phraseList = phrase.split(' ')
    c = nltk.ConcordanceIndex(text.tokens, key=lambda s: s.lower())
    offsets = [c.offsets(x) for x in phraseList]
    # rebase each token's offsets to the start of the phrase
    offsets_norm = [[x - i for x in offsets[i]] for i in range(len(phraseList))]
    # offsets present in every rebased list mark full phrase matches
    intersects = set(offsets_norm[0]).intersection(*offsets_norm[1:])
    for offset in intersects:
        start_offset = max(offset - left_margin, 0)
        end_offset = offset + len(phraseList) + right_margin
        concordance_txt_left = text.tokens[start_offset:offset]
        concordance_txt_middle = text.tokens[offset:offset + len(phraseList)]
        concordance_txt_right = text.tokens[offset + len(phraseList):end_offset]
        yield (' '.join(concordance_txt_left),
               ' '.join(concordance_txt_middle),
               ' '.join(concordance_txt_right))
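# This variant is a generator yielding (left, match, right) triples; a usage sketch
# with an invented sentence:
import nltk
toks = nltk.word_tokenize("one quick brown fox met another quick brown fox today")
for left, middle, right in n_concordance_tokenised(nltk.Text(toks), "quick brown",
                                                   left_margin=2, right_margin=2):
    print(f"{left} [{middle}] {right}")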
import nltk
from sys import argv

script, inputfilename, expression = argv
with open(inputfilename) as f:
    raw = f.read()
tokens = nltk.wordpunct_tokenize(raw)
text = nltk.Text(tokens)
text.concordance(expression, width=40, lines=100)
text.findall('<%s><to><.*><.*>' % expression)
words = [w.lower() for w in text]
c = nltk.ConcordanceIndex(text.tokens)
# collect the token two places after each hit, guarding the end of the text
unableset = [
    text.tokens[offset + 2]
    for offset in c.offsets(expression)
    if offset + 2 < len(text.tokens)
]
print(len(unableset))
words = [w.lower() for w in unableset]
vocab = sorted(set(words))
print(len(vocab))
fdist = nltk.FreqDist(unableset)
f = open("crash.txt") #f = open("crash.txt", 'r+', encoding="utf-8") #f = open("crashchap1") crash = f.read() #+ff.read() tokens = nltk.word_tokenize(crash) text1 = nltk.Text(tokens) #lines=text.concordance("injuries", 140, 1000) # list of terms to re-permute ll = {} for rnge in range(21): ll[rnge] = [] c = nltk.ConcordanceIndex(text1.tokens, key=lambda s: s.lower()) for offset in c.offsets('crash'): for rnge in range(21): ll[rnge] = ll[rnge] + [text1.tokens[offset + (rnge - 10)]] #print ll # permute/randomise our x terms def a(input): x = input[random.randint(0, len(input) - 1)] return x sentences = "" for x in range(100):
def textRank(tokenized_words, tag_word_dict):
    words_set = syntactic_filter(tag_word_dict)
    # Add the vertices to the graph
    graph_dict = {}
    for w in words_set:
        graph_dict[w] = []
    graph = Graph(graph_dict)
    # Add the edges
    words = nltk.Text(tokenized_words)
    doc = nltk.ConcordanceIndex(words)
    for w in words_set:
        results = get_concordance(w, doc)
        for context in results:
            left = context[0].split()
            right = context[1].split()
            for l in left:
                if l in words_set:
                    graph.add_edge((w, l))
                    graph.add_edge((l, w))
            for l in right:
                if l in words_set:
                    graph.add_edge((w, l))
                    graph.add_edge((l, w))
    # Run the TextRank algorithm
    delta = 1
    i = 0
    d = 0.85
    while delta > 0.0001 and i < 5000:
        # track the largest rank change per sweep so iteration stops only once
        # every vertex has converged (the original tracked the smallest change,
        # which halts as soon as any single vertex stops moving)
        delta = 0
        for v in graph.vertices():
            old_rank = graph.text_rank(v)
            rank_sum = 0
            for v2 in graph.adjacency_list(v):
                degree2 = graph.vertex_degree(v2)
                rank_sum += graph.text_rank(v2) / degree2
            value = (1 - d) + d * rank_sum
            graph.set_text_rank(v, value)
            if abs(value - old_rank) > delta:
                delta = abs(value - old_rank)
        i = i + 1
    text_rank_dict = {}
    for v in graph.vertices():
        text_rank_dict[v] = graph.text_rank(v)
    sorted_text_rank = sorted(text_rank_dict.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sorted_text_rank
def get_words_from_proximity(self, keyword_list, text):
    # think of how to get nouns from the sentence only... !!
    # create object of doc_frequency
    doc_freq_obj = doc_freq_class.context()
    tokens = nltk.word_tokenize(text)
    # drop non-alphanumeric tokens; filtering a copy avoids skipping items
    # while mutating the list being iterated, as the original did
    tokens = [t for t in tokens if t.isalnum()]
    c = nltk.ConcordanceIndex(tokens, key=lambda s: s.lower())
    tokens_pos = nltk.pos_tag(tokens)
    doc_freq = []
    df_cnt = 0
    print("keywords going to loop", keyword_list)
    keywords = [nltk.word_tokenize(k) for k in keyword_list]
    print("keywords", keywords)
    for kw in keywords:
        print("keyword::::::::", kw)
        first_word = kw[0]  # 1st word in keyword
        keyword_len = len(kw)
        print("offset", c.offsets(first_word))
        doc_freq.append(document_frequency(kw))
        no_of_times = 0
        for offset in c.offsets(first_word):
            print(kw)
            # confirm the remaining keyword tokens follow the first one
            nomatch = 0
            j = 1
            while j < keyword_len:
                if tokens[offset + j].lower() != kw[j].lower():  # `<>` is not valid Python 3
                    nomatch = 1
                    break
                j = j + 1
            if nomatch == 0:
                # collect noun neighbours in a window of 5 tokens on each side;
                # the stopword test checks the token itself, not its POS tag
                i = 5
                while i > 0:
                    if (offset - i) < 0:
                        break
                    if (tokens_pos[offset - i][1] in ["NN", "NNP"]) and \
                       (tokens_pos[offset - i][0].lower() not in nltk.corpus.stopwords.words('english')):
                        doc_freq[df_cnt].addneighbour(tokens_pos[offset - i][0])
                        print(tokens_pos[offset - i][0], end=' ')
                    i = i - 1
                print(r"\m/ ", kw, r"\m/ ", end=' ')
                i = 1
                while i < 5:
                    if (offset + i + (keyword_len - 1)) >= len(tokens):
                        break
                    if (tokens_pos[offset + i + (keyword_len - 1)][1] in ["NN", "NNP"]) and \
                       (tokens_pos[offset + i + (keyword_len - 1)][0].lower() not in nltk.corpus.stopwords.words('english')):
                        doc_freq[df_cnt].addneighbour(tokens[offset + i + (keyword_len - 1)])
                        print(tokens_pos[offset + i + (keyword_len - 1)][0], end=' ')
                    i = i + 1
                k = 0
                print("\n\n")
                while k < doc_freq[df_cnt].cnt:
                    doc_freq[df_cnt].neighbours[k].find_doc_freq(doc_freq[df_cnt].keyword)
                    k = k + 1
                doc_freq[df_cnt].neighbours.sort(key=lambda x: x.freq_together, reverse=True)
                if doc_freq[df_cnt].cnt > 5:
                    # keep the 5 neighbours with the highest weight
                    doc_freq[df_cnt].neighbours = doc_freq[df_cnt].neighbours[:5]
                    doc_freq[df_cnt].cnt = 5
                print("keyword: ", end=' ')
                for l in doc_freq[df_cnt].keyword:
                    print(l, end=' ')
                print("\n")
                print("neighbours: ", end=' ')
                for m in doc_freq[df_cnt].neighbours:
                    print(m.word, end=' ')
                print("\n")
                no_of_times = no_of_times + 1
                if no_of_times >= 2:
                    break
        df_cnt = df_cnt + 1
    # results = search_web(doc_freq)
    print(doc_freq)
    return doc_freq
def main(argv, matches=2):
    fName = 'bbc/politics/' + str(argv)
    with open(fName, 'r') as f:
        raw_text = f.read()

    # Tokenize the words of the text
    tokenized_words = nltk.word_tokenize(raw_text)

    # Lowercase the tokens
    for i in range(len(tokenized_words)):
        tokenized_words[i] = tokenized_words[i].lower()

    # POS tag the words
    tagged_words = nltk.pos_tag(tokenized_words)

    # Extract the tags of the text
    tags = set([tag for (word, tag) in tagged_words])

    word_tag_dict = {}
    tag_word_dict = {}
    for (word, tag) in tagged_words:
        # tokens were lowercased above, so `word` is already a normalised key
        if word in word_tag_dict:
            word_tag_dict[word].append(tag)
        else:
            word_tag_dict[word] = [tag]
        if tag in tag_word_dict:
            tag_word_dict[tag].append(word)
        else:
            tag_word_dict[tag] = [word]

    words = nltk.Text(tokenized_words)
    doc = nltk.ConcordanceIndex(words)
    stemmer = PorterStemmer()

    # # Call text Rank
    # sorted_text_rank = textRank(tokenized_words, tag_word_dict)
    # set1 = set([w.lower() for (w, val) in sorted_text_rank[:15]])
    # removeList = []
    # for w in set1:
    #     if stemmer.stem(w) != w and stemmer.stem(w) in set1:
    #         removeList.append(w)
    # for w in removeList:
    #     set1.remove(w)
    # sorted_text_rank = [(w, val) for (w, val) in sorted_text_rank[:15] if w not in removeList]
    # offset_dict_text_rank = {}
    # for words1 in set1:
    #     offset_dict_text_rank[words1] = doc.offsets(words1)

    # Call tf
    sorted_tfValues = tf(tokenized_words, word_tag_dict)
    set2 = set([w.lower() for (w, val) in sorted_tfValues[:15]])
    removeList = []
    for w in set2:
        # drop a term whose stem also appears among the top terms
        if stemmer.stem(w) != w and stemmer.stem(w) in set2:
            removeList.append(w)
    for w in removeList:
        set2.remove(w)
    sorted_tfValues = [(w, val) for (w, val) in sorted_tfValues[:15] if w not in removeList]
    offset_dict_tf = {}
    for words2 in set2:
        offset_dict_tf[words2] = doc.offsets(words2)

    # # Call tf-idf
    # sorted_tf_idf = tfIdf(raw_text, word_tag_dict)
    # set3 = set([w for (w, val) in sorted_tf_idf[:15]])
    # removeList = []
    # for w in set3:
    #     if stemmer.stem(w) != w and stemmer.stem(w) in set3:
    #         removeList.append(w)
    # for w in removeList:
    #     set3.remove(w)
    # sorted_tf_idf = [(w, val) for (w, val) in sorted_tf_idf[:15] if w not in removeList]
    # offset_dict_tf_idf = {}
    # for words3 in set3:
    #     offset_dict_tf_idf[words3] = doc.offsets(words3)

    """ Printing the results """
    # print (raw_text)
    # print ("\n\nText Rank of the document:")
    # printResult (sorted_text_rank, word_tag_dict, offset_dict_text_rank)
    # printTable (sorted_text_rank, offset_dict_text_rank)
    # printMatrix (offset_dict_text_rank)

    print("\n\nTf Scores of the document:\n")
    printResult(sorted_tfValues, word_tag_dict, offset_dict_tf)
    out_list, tid_word_dict = printTable(sorted_tfValues, offset_dict_tf)
    words_list = printMatrix(offset_dict_tf)
    print_top_sentence(raw_text, sorted_tfValues, matches, out_list, tid_word_dict, words_list)
    print_sentences(raw_text, sorted_tfValues, tid_word_dict, words_list)
def context(word):
    # note: `word` is unused here; the index is built over the module-level
    # `tokens`, and the caller looks the word up via .offsets(word)
    return nltk.ConcordanceIndex(tokens)
def show_word_in_context2(target_word, text, context_size=5):
    # signature inferred from the call below; stems every token and collects a
    # window of `context_size` words around each stem match
    words = nltk.word_tokenize(text)
    stemmer = nltk.LancasterStemmer()
    target_stem = stemmer.stem(target_word)
    text_parts = []
    for word_num, word in enumerate(words):
        if stemmer.stem(word) == target_stem:
            start = max(word_num - context_size, 0)
            stop = word_num + context_size + 1
            text_parts.append(words[start:stop])
    return text_parts

print("Version 2:")
text_parts = show_word_in_context2("scared", text)
print("Found {} occurrences:".format(len(text_parts)))
for part in text_parts:
    print(" ".join(part))

# Bonus
# ~~~~~
# NLTK has some ready made concordance related objects. In particular, a
# possible solution to the problem could be:

print("Version 3:")
words = nltk.word_tokenize(text)
stemmer = nltk.LancasterStemmer()
c_stemmed = nltk.ConcordanceIndex(words, key=lambda s: stemmer.stem(s.lower()))
# print_concordance prints directly and returns None, so call it rather than
# printing its return value
c_stemmed.print_concordance("scared")

# The object offers more convenience functions. The locations of the matches
# are available with the `offsets` method. That allows, for example, collecting
# the words that follow the matches:
print([words[i + 1] for i in c_stemmed.offsets("scared") if i + 1 < len(words)])
def find_word(file_lines, forward_sentence_type_in_a_list, forward_identify_section_span, word):
    MONTHS = ("January", "February", "March", "April", "May", "June", "July",
              "August", "September", "October", "November", "December")
    result_for_csv_file_before = []
    for line_index_number, line_text in enumerate(file_lines):
        word_tokens = word_tokenize(line_text, language="english")
        sentence_word_count = len(word_tokens)
        if sentence_word_count > 0:
            nltk_concordance_index = nltk.ConcordanceIndex(word_tokens)
            for offset in nltk_concordance_index.offsets(word):
                # need three tokens after the hit: "<word> <day> <month> <year>"
                if offset + 3 >= sentence_word_count:
                    continue
                identified_sentence_part_and_sub_part_as_tuple = get_sentence_type_from_sentence_index_number(
                    line_index_number, forward_sentence_type_in_a_list)
                part_value = identified_sentence_part_and_sub_part_as_tuple[0]
                sub_part_value = identified_sentence_part_and_sub_part_as_tuple[1]
                # test both tokens; the original `is_float(a and b)` only tested the second
                if is_float(word_tokens[offset + 1]) and is_float(word_tokens[offset + 3]):
                    paragraph_number = get_section_from_sentence_index_number(
                        line_index_number, forward_identify_section_span)
                    relation = word_tokens[offset]
                    value_1 = (word_tokens[offset + 1] + " " + word_tokens[offset + 2]
                               + " " + word_tokens[offset + 3])
                    if word_tokens[offset + 2] in MONTHS:
                        # Remove the word "Section " for the Java application
                        only_output_section_number = paragraph_number.replace("Section ", "")
                        result_for_csv_file_before.append([
                            only_output_section_number, part_value,
                            sub_part_value, relation, value_1
                        ])
    return list(result_for_csv_file_before)