def test_extract_segment_short_text(tmpdir):
    """The text is too short for TextTilingTokenizer; verify the fallback works."""
    ttt = TextTilingTokenizer(k=6)
    pipeline_config = {"passagelen": 30, "slicelen": 20, "tfchannel": True}
    extractor = DeepTileExtractor(tmpdir, tmpdir, pipeline_config)

    s = "But we in it shall be rememberèd We few, we happy few, we band of brothers"
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 1
    # N.B - segments are in all lowercase, special chars (comma) have been removed
    assert segments == ["But we in it shall be rememberèd We few, we happy few, we band of brothers"]

    s = (
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that sheds his "
        "blood with me Shall be my brother"
    )
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2
    assert segments == [
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that",
        "sheds his blood with me Shall be my brother",
    ]
def test_extract_segment_long_text(tmpdir):
    """nltk.TextTilingTokenizer only works with large blobs of text."""
    ttt = TextTilingTokenizer(k=6)
    pipeline_config = {"passagelen": 30, "slicelen": 20, "tfchannel": True}
    extractor = DeepTileExtractor(tmpdir, tmpdir, pipeline_config)

    # blob of text with Shakespeare and Shangri La. Should split into two topics
    s = (
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that "
        "wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are enough To do our "
        "country loss; and if to live, The fewer men, the greater share of honour. Gods will! I pray thee, wish"
        " not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon "
        "by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided "
        "from a lamasery, enclosed in the western end of the Kunlun Mountains. Shangri-La has become synonymous with "
        "any earthly paradise, particularly a mythical Himalayan utopia – a permanently happy land, isolated from "
        "the world"
    )
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2
    # The split was determined by nltk.TextTilingTokenizer. Far from perfect
    assert segments == [
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that "
        "wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are",
        " enough To do our country loss; and if to live, The fewer men, the greater share of honour. Gods will! I "
        "pray thee, wish not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon "
        "by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided "
        "from a lamasery, enclosed in the western end of the Kunlun Mountains. \n"
        "Shangri-La has become synonymous with any earthly paradise, particularly a mythical Himalayan utopia – a "
        "permanently happy land, isolated from the world",
    ]
def test_deeptiles_extract_segment_short_text(tmpdir, monkeypatch, dummy_index):
    """Fallback check with the provide-style constructor: text too short for TextTilingTokenizer."""

    def fake_magnitude_embedding(*args, **kwargs):
        # Avoid downloading real embeddings during the test
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb", fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    # The text is too short for TextTilingTokenizer. Test if the fallback works
    ttt = TextTilingTokenizer(k=6)
    pipeline_config = {
        "name": "deeptiles",
        "passagelen": 30,
        "slicelen": 20,
        "tfchannel": True,
        "tilechannels": 3,
        "index": {"collection": {"name": "dummy"}},
    }
    extractor = DeepTileExtractor(pipeline_config, provide={"index": dummy_index, "benchmark": benchmark})

    s = "But we in it shall be rememberèd We few, we happy few, we band of brothers"
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 1
    # N.B - segments are in all lowercase, special chars (comma) have been removed
    assert segments == ["But we in it shall be rememberèd We few, we happy few, we band of brothers"]

    s = (
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that sheds his "
        "blood with me Shall be my brother"
    )
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2
    assert segments == [
        "But we in it shall be rememberèd We few, we happy few, we band of brothers. For he to-day that",
        "sheds his blood with me Shall be my brother",
    ]
def test_deeptiles_extract_segment_long_text(tmpdir, monkeypatch, dummy_index):
    """nltk.TextTilingTokenizer only works with large blobs of text."""

    def fake_magnitude_embedding(*args, **kwargs):
        # Avoid downloading real embeddings during the test
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb", fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    ttt = TextTilingTokenizer(k=6)
    extractor_config = {
        "name": "deeptiles",
        "embeddings": "glove6b",
        "tilechannels": 3,
        "passagelen": 30,
        "slicelen": 20,
        "tfchannel": True,
    }
    extractor = DeepTileExtractor(extractor_config, provide={"index": dummy_index, "benchmark": benchmark})

    # blob of text with Shakespeare and Shangri La. Should split into two topics
    s = (
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that "
        "wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are enough To do our "
        "country loss; and if to live, The fewer men, the greater share of honour. Gods will! I pray thee, wish"
        " not one man more. Shangri-La is a fictional place described in the 1933 novel Lost Horizon "
        "by British author James Hilton. Hilton describes Shangri-La as a mystical, harmonious valley, gently guided "
        "from a lamasery, enclosed in the western end of the Kunlun Mountains. Shangri-La has become synonymous with "
        "any earthly paradise, particularly a mythical Himalayan utopia – a permanently happy land, isolated from "
        "the world"
    )
    doc_toks = s.split(" ")
    segments = extractor.extract_segment(doc_toks, ttt)
    assert len(segments) == 2
    # The split was determined by nltk.TextTilingTokenizer. Far from perfect
    assert segments == [
        "O that we now had here but one ten thousand of those men in England That do no work to-day. Whats he that "
        "wishes so? My cousin, Westmorland? No, my fair cousin. If we are marked to die, we are",
        " enough To do our country loss; and if to live, The fewer men, the greater share of honour. Gods will! I "
        "pray thee, wish not one man more. \n"
        "Shangri-La is a fictional place described in the 1933 novel Lost Horizon by British author James Hilton. "
        "Hilton describes Shangri-La as a mystical, harmonious valley, gently guided from a lamasery, enclosed in "
        "the western end of the Kunlun Mountains. Shangri-La has become synonymous with any earthly paradise, "
        "particularly a mythical Himalayan utopia – a permanently happy land, isolated from the world",
    ]
def _build_vocab(self, qids, docids, topics):
    """Build the qid/docid token maps, stoi/itos vocab, and per-doc segments.

    Loads everything from cache when available (and caching is enabled);
    otherwise tokenizes topics and documents, extends the vocab, and
    extracts TextTiling segments for each document.
    """
    if self.is_state_cached(qids, docids) and self.config["usecache"]:
        self.load_state(qids, docids)
        logger.info("Vocabulary loaded from cache")
    else:
        tokenize = self.tokenizer.tokenize
        ttt = TextTilingTokenizer(k=6)  # TODO: Make K configurable?
        # TODO: Move the stoi and itos creation to a reusable mixin
        self.qid2toks = {qid: tokenize(topics[qid]) for qid in qids}
        self.docid2toks = {docid: tokenize(self.index.get_doc(docid)) for docid in docids}
        self._extend_stoi(self.qid2toks.values(), calc_idf=True)
        self._extend_stoi(self.docid2toks.values(), calc_idf=True)
        self.itos = {i: s for s, i in self.stoi.items()}
        self.docid2segments = {
            doc_id: self.clean_segments(self.extract_segment(doc_toks, ttt, slicelen=self.config["slicelen"]))
            for doc_id, doc_toks in tqdm(self.docid2toks.items(), desc="Extracting segments")
        }
        if self.config["usecache"]:
            self.cache_state(qids, docids)
from nltk import TextTilingTokenizer import re courses = ['classicalcomp-001'] tt = TextTilingTokenizer() line_break = re.compile('\n+\t+') log = open('texttile.err.log', 'w') for name in courses: infilename = '../../../feats/in' + name + '_texttile' outfilename = '../../../feats/out' + name + '_segments' out = open(outfilename, 'w') prev_c_id = None full_text = '' lines = open(infilename, 'r').readlines() length = len(lines) num_posts = 1 with open(infilename) as text: for i in range(0, length): #s, ss, d, b = c = text.readline().strip().split('\t') c_text, c_id = c[1], c[0] if prev_c_id != c_id and len(full_text) > 100 and len( re.findall(line_break, full_text)) > 1: try: #s, ss, d, b = tt.tokenize(full_text) segmented_text = tt.tokenize(full_text)