import os

from nltk.tokenize import TextTilingTokenizer


def run():
    # convert_pdf_to_txt and get_intro_conclusion are project helpers
    # assumed to be defined elsewhere; raw_docs and text_tiles are
    # initialised here so the function is self-contained.
    raw_docs = {}
    text_tiles = {}
    for idx, filename in enumerate(os.listdir(os.getcwd() + '/papers')):
        paper_path = os.getcwd() + '/papers/' + filename
        content = convert_pdf_to_txt(paper_path).lower()
        # .replace('\n\n', '\n')
        # .replace('\f', '')
        # .replace('\n', ' ')
        # .replace('-', ' ')
        relevant_text = get_intro_conclusion(content)
        raw_docs[idx] = relevant_text
        relevant_text = relevant_text.replace('- ', '')

        # Utilising NLTK TextTiling with default params
        # seg_2 = TextTilingTokenizer().tokenize(relevant_text)

        # Utilising NLTK TextTiling with custom params
        # (pseudosentence size w, block comparison size k)
        tt = TextTilingTokenizer(w=10, k=4)
        paper_tiles = tt.tokenize(relevant_text)
        text_tiles[idx] = paper_tiles
    return raw_docs, text_tiles
import numpy as np
from nltk.tokenize import TextTilingTokenizer, sent_tokenize, word_tokenize


def vis_tokenize(context, question):
    # utils.load_glove is a project helper assumed to return a
    # word -> vector mapping.
    glove = utils.load_glove(dim=200)
    ttt = TextTilingTokenizer()  # instantiated but unused in this snippet
    para_list = []
    # Splits on a literal backslash-n sequence, as in the original;
    # use '\n' instead if the context contains real newlines.
    paras = [para for para in context.split('\\n') if para != '']
    for para in paras:
        sent_list = []
        for sent in sent_tokenize(para):
            temp = {}
            temp['words'] = word_tokenize(sent)
            temp['vectors'] = [
                np.array(glove[word.lower()]) for word in temp['words']
            ]
            sent_list.append(temp)
        para_list.append(sent_list)
    q_dict = {}
    q_dict['words'] = word_tokenize(question)
    q_dict['vectors'] = [
        np.array(glove[word.lower()]) for word in q_dict['words']
    ]
    return para_list, q_dict
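# A sketch of the structures vis_tokenize returns, under the assumption
# that utils.load_glove yields a word -> 200-d vector mapping (names and
# values below are illustrative, not from the original project):
#
#   para_list -> one list per paragraph, one dict per sentence:
#       [[{'words': ['The', 'cat', 'sat', '.'],
#          'vectors': [array(...), array(...), array(...), array(...)]}]]
#   q_dict    -> {'words': [...], 'vectors': [...]}
#
# Note that 'words' keeps the original casing; only the GloVe lookup
# lower-cases, so out-of-vocabulary tokens will raise a KeyError.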
import sys

from nltk.tokenize import TextTilingTokenizer


def texttiling():
    # db_conn is a project helper assumed to return a DB connection.
    conn = db_conn('map')
    cur = conn.cursor()
    tt = TextTilingTokenizer()

    # Select all unique observations.
    sql = 'SELECT DISTINCT(observation) FROM utterances'
    cur.execute(sql)
    unique_observs = [t[0] for t in cur.fetchall()]

    # For each observation, segment its utterances into topics.
    for i, obsv in enumerate(unique_observs):
        sql = 'SELECT utterID, tagged FROM utterances WHERE observation = %s AND tagged <> ""'
        cur.execute(sql, [obsv])
        utter_id, tagged = zip(*cur.fetchall())
        # Join utterances with a marker that TextTiling treats as a
        # paragraph break, and that can be split on again below.
        text = '\n\n\n\t'.join(tagged)
        try:
            segmented_text = tt.tokenize(text)
        except Exception:
            raise
        else:
            uid_idx = 0
            for j, seg in enumerate(segmented_text):
                topic_id = j + 1
                sents = [s for s in seg.split('\n\n\n\t') if s != '']
                for k, s in enumerate(sents):
                    in_topic_id = k + 1
                    sql = ('UPDATE utterances SET topicID = %s, inTopicID = %s '
                           'WHERE observation = %s AND utterID = %s')
                    cur.execute(
                        sql, (topic_id, in_topic_id, obsv, utter_id[uid_idx]))
                    uid_idx += 1
            conn.commit()
        sys.stdout.write('\r{}/{}'.format(i + 1, len(unique_observs)))
        sys.stdout.flush()
def texttiling_text(text, k=20, w=40, smoothing_width=10, smoothing_rounds=5):
    # raw_stopword_list is assumed to be defined elsewhere in this module.
    tt = TextTilingTokenizer(stopwords=raw_stopword_list,
                             k=k,
                             w=w,
                             smoothing_width=smoothing_width,
                             smoothing_rounds=smoothing_rounds)
    o = tt.tokenize(text)
    return o
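# A minimal usage sketch for texttiling_text, reusing the Brown corpus
# text from NLTK's own TextTiling demo. raw_stopword_list is not defined
# in the snippet above, so a standard NLTK stop word list is substituted
# here as an assumption.
from nltk.corpus import brown, stopwords

raw_stopword_list = stopwords.words('english')  # hypothetical stand-in
tiles = texttiling_text(brown.raw()[:10000])
print(len(tiles), 'segments')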
def reload_tiler(self):
    """
    Reload the text tiler. Use if memory is an issue.
    """
    del self.__tiler
    self.__tiler = TextTilingTokenizer(stopwords=self.__stop_words,
                                       cutoff_policy=self.__cutoff_policy,
                                       w=self.__w,
                                       k=self.__k)
def split_pp_to_paragraphs(clean_pp, contractions_dict, pattern):
    """
    Uses TextTilingTokenizer to split a privacy policy document into
    paragraphs; the document should be pre-processed (HTML cleaned)
    before reaching this function.

    :param clean_pp: clean pp before expansion of contractions and special cases
    :param contractions_dict: a dictionary that includes all varieties of
        contractions and their expansion
    :param pattern: pattern for the expansion of contractions
    :return: list of paragraphs
    """
    clean_pp = clean_pp_advanced(clean_pp, contractions_dict, pattern)
    ttt = TextTilingTokenizer()
    paragraphs = ttt.tokenize(clean_pp)
    return paragraphs
def segments(txt):
    ttt = TextTilingTokenizer()
    tokens = ttt.tokenize(txt)
    start = 0
    end = 0
    tileSpan = []
    # Accumulate (start, end) character offsets for each tile.
    for token in tokens:
        end = start + len(token)
        tileSpan.append((start, end))
        start = end
    return tileSpan
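# Usage sketch for segments(). NLTK's tokenizer returns the tiles as
# contiguous slices of the input, so each (start, end) pair should map
# straight back to character offsets in the original string. The sample
# text mirrors the demo below.
from nltk.corpus import brown

txt = brown.raw()[:10000]
for start, end in segments(txt):
    print(start, end, repr(txt[start:end][:40]))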
def demo(text=None):
    from nltk.corpus import brown
    from matplotlib import pylab

    tt = TextTilingTokenizer(demo_mode=True)
    if text is None:
        text = brown.raw()[:10000]
    # In demo mode, tokenize returns gap scores, smoothed gap scores,
    # depth scores and segment boundaries instead of the tiles.
    s, ss, d, b = tt.tokenize(text)
    pylab.xlabel("Sentence Gap index")
    pylab.ylabel("Gap Scores")
    pylab.plot(range(len(s)), s, label="Gap Scores")
    pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
    pylab.plot(range(len(d)), d, label="Depth scores")
    pylab.stem(range(len(b)), b)
    pylab.legend()
    pylab.show()
def __init__(self, cutoff_policy='HC',
             stop_words=stopwords.words('english'), w=20, k=10):
    """
    Constructor
    """
    # Note: the stop word default is evaluated once, at definition time.
    self.__stop_words = stop_words
    self.__cutoff_policy = cutoff_policy
    self.__w = w
    self.__k = k
    self.__tiler = TextTilingTokenizer(stopwords=stop_words,
                                       cutoff_policy=cutoff_policy,
                                       w=w,
                                       k=k)
def segment_transcript(doc):
    """doc is a document object with text lines in 'lines'; add a list of
    'topics' (start/end line indices) to the document object and return it.
    """
    tok = TextTilingTokenizer()
    lines = [turn['text'] for turn in doc['lines']]
    # Blank lines between turns give TextTiling its paragraph breaks.
    text = "\n\n".join(lines)
    doc['topics'] = []
    start = 0
    for topic in tok.tokenize(text):
        length = len(topic.strip().split('\n\n'))
        end = start + length
        doc['topics'].append({'start': start, 'end': end})
        start = end
    return doc
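# Shape of the input and output, sketched with hypothetical data:
#
#   doc = {'lines': [{'text': 'turn one ...'},
#                    {'text': 'turn two ...'},
#                    ...]}
#   doc = segment_transcript(doc)
#   doc['topics'] -> [{'start': 0, 'end': 3}, {'start': 3, 'end': 7}, ...]
#
# 'start'/'end' index into doc['lines'], with 'end' exclusive; a real
# transcript needs enough turns for the default window (w=20), or NLTK's
# smoothing step raises a ValueError.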
from nltk.corpus import stopwords
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tag import pos_tag, pos_tag_sents
from nltk import word_tokenize
import codecs
from argparse import ArgumentParser
import os

argparser = ArgumentParser()
argparser.add_argument('file', help="text document")
args = argparser.parse_args()

stopwords = stopwords.words('english')
doc_path = os.path.splitext(args.file)[0]

tt = TextTilingTokenizer()
text = codecs.open(doc_path + '.txt', 'r', "utf-8").read()
parags = tt.tokenize(text)

buffer_tiled = ''
buffer_tiled_tagged = ''
buffer_tiled_tagged_clean = ''

tagged_parags = pos_tag_sents([word_tokenize(p) for p in parags])
# Drop stop words from each tagged paragraph (materialised as lists so
# the result survives multiple passes under Python 3).
clean_parags = [
    [taggedword for taggedword in p if taggedword[0] not in stopwords]
    for p in tagged_parags
]
for i, p in enumerate(parags):
    buffer_tiled += p
def __init__(self):
    self._tt = TextTilingTokenizer()
def get_paragraphs_from_text(text):
    tiling_tokenizer = TextTilingTokenizer()
    paragraphs = tiling_tokenizer.tokenize(text)
    return paragraphs
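# Usage sketch: TextTiling places boundaries only at blank-line
# paragraph breaks, and very short inputs can make NLTK's smoothing
# raise a ValueError, so a reasonably long text is used here.
from nltk.corpus import brown

paragraphs = get_paragraphs_from_text(brown.raw()[:10000])
print(len(paragraphs))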