# NOTE(review): this chunk opens mid-definition — the loop below appears to be
# the tail of the `segment_common_fast` timing function (its `def` line and the
# `start = time.time()` initializer are outside this view); `processor`,
# `common1`, `common2` presumably come from earlier in the file — confirm.
for _ in range(25):
    operations = list(processor.process(common1))
    operations = list(processor.process(common2))
# 25 iterations x 2 process() calls -> elapsed time is divided by 50
# to report the per-diff average.
print("\tcommon_fast: {0}".format((time.time() - start)/50))

segment_common_fast()
#profile.run('segment_common()', sort="cumulative")


def segment_random():
    # Time 25 diffs of the two unrelated ("random") token sequences and
    # print the per-diff average in seconds.
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff(random1_tokens, random2_tokens))
    print("\trandom: {0}".format((time.time() - start)/25))

#segment_random()
#profile.run('segment_random()', sort="cumulative")

# Pre-segment each token stream once so the "post segmentation" benchmarks
# below time only the matching step, not segmentation itself.
common1_segments = segmenter.segment(common1_tokens)
common2_segments = segmenter.segment(common2_tokens)
random1_segments = segmenter.segment(random1_tokens)
random2_segments = segmenter.segment(random2_tokens)

print("Running segment matcher (post segmentation):")


def segment_common_seg():
    # Time 25 diffs of the pre-segmented, mostly-similar documents and
    # print the per-diff average in seconds.
    start = time.time()
    for _ in range(25):
        operations = list(segment_matcher.diff_segments(common1_segments,
                                                        common2_segments))
    print("\tcommon: {0}".format((time.time() - start)/25))

segment_common_seg()
#profile.run('segment_common_seg()', sort="cumulative")


def segment_random_seg():
    # NOTE(review): this definition is truncated here — its body continues
    # past the end of this chunk.
    start = time.time()
# NOTE(review): this chunk opens mid-definition — `sentences = []` through
# `return sentences` appears to be the body of a `process_sentences(segments)`
# helper whose `def` line is outside this view (the bare `return` confirms it).
# It collects every MatchableSegment sentence nested inside every
# MatchableSegment paragraph, skipping whitespace segments.
sentences = []
for paragraph_or_whitespace in segments:
    if isinstance(paragraph_or_whitespace, MatchableSegment):
        paragraph = paragraph_or_whitespace  # We have a paragraph
        for sentence_or_whitespace in paragraph:
            if isinstance(sentence_or_whitespace, MatchableSegment):
                sentence = sentence_or_whitespace  # We have a sentence
                sentences.append(sentence)
return sentences


def my_strip_code(wikicode):
    # Render a mwparserfromhell Wikicode object as one plain-text string.
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):
    # Yield the plain-text rendering of each top-level node of `wikicode`.
    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if isinstance(node, Wikilink):
            # For [[target|label]] links keep only the label text
            # (split("|")[-1] returns the target when no label exists).
            stripped = stripped.split("|")[-1]
        # Nodes that strip to nothing (e.g. markup-only nodes) yield None
        # and are skipped.
        if stripped is not None:
            yield str(stripped)


# Demo: tokenize `text` (defined earlier — confirm), segment it, then print
# each sentence with wiki markup stripped; newlines are flattened to spaces
# before re-parsing so each sentence parses as a single line.
tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))
for sentence in sentences:
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", " ")))
    print(" *", raw_sentence)
# NOTE(review): this chunk duplicates the previous one and also opens
# mid-definition — the loop through `return sentences` appears to be the body
# of a `process_sentences(segments)` helper whose `def` line (and the
# `sentences = []` initializer) is outside this view. It collects every
# MatchableSegment sentence nested inside every MatchableSegment paragraph.
for paragraph_or_whitespace in segments:
    if isinstance(paragraph_or_whitespace, MatchableSegment):
        paragraph = paragraph_or_whitespace  # We have a paragraph
        for sentence_or_whitespace in paragraph:
            if isinstance(sentence_or_whitespace, MatchableSegment):
                sentence = sentence_or_whitespace  # We have a sentence
                sentences.append(sentence)
return sentences


def my_strip_code(wikicode):
    # Render a mwparserfromhell Wikicode object as one plain-text string.
    return "".join(_my_strip_code(wikicode))


def _my_strip_code(wikicode):
    # Yield the plain-text rendering of each top-level node of `wikicode`.
    for node in wikicode.nodes:
        stripped = node.__strip__(normalize=True, collapse=True)
        if isinstance(node, Wikilink):
            # For [[target|label]] links keep only the label text
            # (split("|")[-1] returns the target when no label exists).
            stripped = stripped.split("|")[-1]
        # Nodes that strip to nothing yield None and are skipped.
        if stripped is not None:
            yield str(stripped)


# Demo: tokenize `text` (defined earlier — confirm), segment it, then print
# each sentence with wiki markup stripped; newlines are flattened to spaces
# before re-parsing so each sentence parses as a single line.
tokens = wikitext_split.tokenize(text)
sentences = process_sentences(segmenter.segment(tokens))
for sentence in sentences:
    raw_sentence = my_strip_code(mwp.parse(str(sentence).replace("\n", " ")))
    print(" *", raw_sentence)
# --- segment-matcher benchmarks, continued ---
segment_common_fast()
#profile.run('segment_common()', sort="cumulative")


def segment_random():
    """Benchmark 25 diffs of the two unrelated token sequences.

    Prints the average wall-clock seconds per diff.
    """
    began = time.time()
    for _run in range(25):
        ops = list(segment_matcher.diff(random1_tokens, random2_tokens))
    elapsed = time.time() - began
    print("\trandom: {0}".format(elapsed / 25))


#segment_random()
#profile.run('segment_random()', sort="cumulative")

# Segment each token stream once up front so the "post segmentation"
# benchmarks time only the matching step, not segmentation.
common1_segments = segmenter.segment(common1_tokens)
common2_segments = segmenter.segment(common2_tokens)
random1_segments = segmenter.segment(random1_tokens)
random2_segments = segmenter.segment(random2_tokens)

print("Running segment matcher (post segmentation):")


def segment_common_seg():
    """Benchmark 25 diffs of the pre-segmented, mostly-similar documents.

    Prints the average wall-clock seconds per diff.
    """
    began = time.time()
    for _run in range(25):
        ops = list(segment_matcher.diff_segments(common1_segments,
                                                 common2_segments))
    elapsed = time.time() - began
    print("\tcommon: {0}".format(elapsed / 25))