Example #1
0
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas import sequence_matcher, segment_matcher
import sys
sys.path.insert(0, ".")


# Two token sequences: the same pair of sentences, in opposite order.
a = [
    "This", "is", "a", "sentence", ".",
    "  ",
    "This", "isn't", "a", "sentence", ".",
]
b = [
    "This", "isn't", "a", "sentence", ".",
    "  ",
    "This", "is", "a", "sentence", ".",
]

print("Comparing:")
print(" - A: {0}".format(a))
print(" - B: {0}".format(b))

print("\n")

# Diff the token sequences with the plain sequence matcher.
print("Longest common substring:")
sequence_ops = sequence_matcher.diff(a, b)
for op in sequence_ops:
    print("--> " + str(op))

print("\n")

# Diff the same sequences with the segment matcher, handing it a
# ParagraphsSentencesAndWhitespace segmenter.
print("Segment matcher:")
segment_ops = segment_matcher.diff(
    a, b, segmenter=ParagraphsSentencesAndWhitespace())
for op in segment_ops:
    print("--> " + str(op))
Example #2
0
import cProfile as profile
import random
import time
import pickle

from deltas import segment_matcher, sequence_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split
from mw import api

tokenizer = wikitext_split
segmenter = ParagraphsSentencesAndWhitespace()

# Fetch the content of two revisions through the English Wikipedia API.
session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']
common2 = session.revisions.get(638077284, properties={"content"})['*']

common1_tokens = list(tokenizer.tokenize(common1))
common2_tokens = list(tokenizer.tokenize(common2))

# Read the word list inside a context manager so the file handle is closed
# as soon as the list has been built.
with open('/usr/share/dict/words') as words_file:
    words = [line.strip() for line in words_file]

# Rebuild the text with every "word" token replaced by a random dictionary
# word; all other tokens are kept verbatim.
# NOTE(review): both random texts are derived from common1_tokens -- confirm
# whether random2 was meant to be derived from common2_tokens instead.
random1 = ''.join(random.choice(words) if t.type == "word" else str(t)
                  for t in common1_tokens)
random2 = ''.join(random.choice(words) if t.type == "word" else str(t)
                  for t in common1_tokens)

random2_tokens = list(tokenizer.tokenize(random2))
random1_tokens = list(tokenizer.tokenize(random1))

print("Tokenizing:")
def tokenize_common():
'''Biology''' is a [[natural science]] concerned with the study of [[life]] and living [[organism]]s, including their structure, function, growth, [[evolution]], distribution, identification and [[Taxonomy (biology)|taxonomy]].<ref name=aquarenagloss>Based on definition from: {{cite web |url=http://www.bio.txstate.edu/~wetlands/Glossary/glossary.html |archiveurl=https://web.archive.org/web/20040608113114/http://www.bio.txstate.edu/~wetlands/Glossary/glossary.html |archivedate=2004-06-08 |title=Aquarena Wetlands Project glossary of terms |author=<!--Staff writer(s); no by-line.-->  |publisher=Texas State University at San Marcos }}</ref>  Modern biology is a vast and eclectic field, composed of many [[#Branches|branches and subdisciplines]]. However, despite the broad scope of biology, there are certain general and unifying concepts within it that govern all study and research, consolidating it into single, coherent field. In general, biology recognizes the [[Cell (biology)|cell]] as the basic unit of life, [[genes]] as the basic unit of [[heredity]], and [[evolution]] as the engine that propels the synthesis and creation of new [[species]]. It is also understood today that all the organisms survive by consuming and transforming [[energy]] and by [[homeostasis|regulating]] their internal environment to maintain a stable and vital condition known as [[homeostasis]].

Sub-disciplines of biology are defined by the scale at which organisms are studied, the kinds of organisms studied, and the methods used to study them: [[biochemistry]] examines the rudimentary chemistry of life; [[molecular biology]] studies the complex interactions among biological [[molecule]]s; [[botany]] studies the biology of plants; [[cellular biology]] examines the basic building-block of all life, the [[cell (biology)|cell]]; [[physiology]] examines the physical and chemical functions of [[tissue (biology)|tissues]], [[Organ (anatomy)|organs]], and [[organ system]]s of an organism; [[evolutionary biology]] examines the processes that produced the diversity of life; and [[ecology]] examines how organisms interact in their [[environment (biophysical)|environment]].<ref>{{cite web|url=http://community.weber.edu/sciencemuseum/pages/life_main.asp |title=Life Science, Weber State Museum of Natural Science |publisher=Community.weber.edu |accessdate=2013-10-02}}</ref>

==History==
{{Main|History of biology}}
[[File:Hooke-bluefly.jpg|thumb|alt=A drawing of a fly from facing up, with wing detail|A Diagram of a fly from [[Robert Hooke|Robert Hooke's]] innovative [[Micrographia]], 1665]]
[[File:Tree of life by Haeckel.jpg|thumb|alt=Ernst Haeckel's pedigree of Man family tree from Evolution of Man|[[Ernst Haeckel]]'s Tree of Life (1879)]]
The term ''[[wikt:biology|biology]]'' is derived from the [[Greek Language|Greek]] word {{lang|grc|[[wikt:βίος|βίος]]}}, ''bios'', "[[life]]" and the suffix {{lang|grc|[[wikt:-λογία|-λογία]]}}, ''-logia'', "study of."<ref>{{cite web |url=http://topics.info.com/Who-coined-the-term-biology_716 |title=Who coined the term biology? |work=Info.com|accessdate=2012-06-03}}</ref><ref name=OnlineEtDict>{{cite web|title=biology|url=http://www.etymonline.com/index.php?term=biology&allowed_in_frame=0|publisher=[[Online Etymology Dictionary]]}}</ref> The Latin-language form of the term first appeared in 1736 when Swedish scientist [[Carl Linnaeus]] (Carl von Linné) used ''biologi'' in his ''Bibliotheca botanica''. It was used again in 1766 in a work entitled ''Philosophiae naturalis sive physicae: tomus III, continens geologian, biologian, phytologian generalis'', by [[Michael Christoph Hanow|Michael Christoph Hanov]], a disciple of [[Christian Wolff (philosopher)|Christian Wolff]]. The first German use, ''Biologie'', was in a 1771 translation of Linnaeus' work. In 1797, Theodor Georg August Roose used the term in the preface of a book, ''Grundzüge der Lehre van der Lebenskraft''. [[Karl Friedrich Burdach]] used the term in 1800 in a more restricted sense of the study of human beings from a morphological, physiological and psychological perspective (''Propädeutik zum Studien der gesammten Heilkunst''). The term came into its modern usage with the six-volume treatise ''Biologie, oder Philosophie der lebenden Natur'' (1802–22) by [[Gottfried Reinhold Treviranus]], who announced:<ref name=Richards>{{cite book|last=Richards|first=Robert J.|title=The Romantic Conception of Life: Science and Philosophy in the Age of Goethe|year=2002|publisher=University of Chicago Press|isbn=0-226-71210-9|url=https://books.google.com/?id=X7N4_i7vrTUC&printsec=frontcover#v=onepage&q&f=false}}</ref>

:The objects of our research will be the different forms and manifestations of life, the conditions and laws under which these phenomena occur, and the causes through which they have been effected. The science that concerns itself with these objects we will indicate by the name biology [Biologie] or the doctrine of life [Lebenslehre].

"""

# Module-level segmenter instance, constructed with default arguments.
segmenter = ParagraphsSentencesAndWhitespace()


def process_sentences(segments):
    """
    Collect the matchable segments nested one level inside `segments`.

    Top-level items that are :class:`MatchableSegment` instances are treated
    as paragraphs; each of their children that is also a
    :class:`MatchableSegment` is treated as a sentence. Everything else
    (e.g. whitespace) is skipped at both levels.

    :Parameters:
        segments : `iterable`
            Top-level segments to scan.

    :Returns:
        A `list` of the sentence segments, in encounter order.
    """
    return [
        inner
        for outer in segments
        if isinstance(outer, MatchableSegment)  # paragraph level
        for inner in outer
        if isinstance(inner, MatchableSegment)  # sentence level
    ]


def my_strip_code(wikicode):
'''Biology''' is a [[natural science]] concerned with the study of [[life]] and living [[organism]]s, including their structure, function, growth, [[evolution]], distribution, identification and [[Taxonomy (biology)|taxonomy]].<ref name=aquarenagloss>Based on definition from: {{cite web |url=http://www.bio.txstate.edu/~wetlands/Glossary/glossary.html |archiveurl=https://web.archive.org/web/20040608113114/http://www.bio.txstate.edu/~wetlands/Glossary/glossary.html |archivedate=2004-06-08 |title=Aquarena Wetlands Project glossary of terms |author=<!--Staff writer(s); no by-line.-->  |publisher=Texas State University at San Marcos }}</ref>  Modern biology is a vast and eclectic field, composed of many [[#Branches|branches and subdisciplines]]. However, despite the broad scope of biology, there are certain general and unifying concepts within it that govern all study and research, consolidating it into single, coherent field. In general, biology recognizes the [[Cell (biology)|cell]] as the basic unit of life, [[genes]] as the basic unit of [[heredity]], and [[evolution]] as the engine that propels the synthesis and creation of new [[species]]. It is also understood today that all the organisms survive by consuming and transforming [[energy]] and by [[homeostasis|regulating]] their internal environment to maintain a stable and vital condition known as [[homeostasis]].

Sub-disciplines of biology are defined by the scale at which organisms are studied, the kinds of organisms studied, and the methods used to study them: [[biochemistry]] examines the rudimentary chemistry of life; [[molecular biology]] studies the complex interactions among biological [[molecule]]s; [[botany]] studies the biology of plants; [[cellular biology]] examines the basic building-block of all life, the [[cell (biology)|cell]]; [[physiology]] examines the physical and chemical functions of [[tissue (biology)|tissues]], [[Organ (anatomy)|organs]], and [[organ system]]s of an organism; [[evolutionary biology]] examines the processes that produced the diversity of life; and [[ecology]] examines how organisms interact in their [[environment (biophysical)|environment]].<ref>{{cite web|url=http://community.weber.edu/sciencemuseum/pages/life_main.asp |title=Life Science, Weber State Museum of Natural Science |publisher=Community.weber.edu |accessdate=2013-10-02}}</ref>

==History==
{{Main|History of biology}}
[[File:Hooke-bluefly.jpg|thumb|alt=A drawing of a fly from facing up, with wing detail|A Diagram of a fly from [[Robert Hooke|Robert Hooke's]] innovative [[Micrographia]], 1665]]
[[File:Tree of life by Haeckel.jpg|thumb|alt=Ernst Haeckel's pedigree of Man family tree from Evolution of Man|[[Ernst Haeckel]]'s Tree of Life (1879)]]
The term ''[[wikt:biology|biology]]'' is derived from the [[Greek Language|Greek]] word {{lang|grc|[[wikt:βίος|βίος]]}}, ''bios'', "[[life]]" and the suffix {{lang|grc|[[wikt:-λογία|-λογία]]}}, ''-logia'', "study of."<ref>{{cite web |url=http://topics.info.com/Who-coined-the-term-biology_716 |title=Who coined the term biology? |work=Info.com|accessdate=2012-06-03}}</ref><ref name=OnlineEtDict>{{cite web|title=biology|url=http://www.etymonline.com/index.php?term=biology&allowed_in_frame=0|publisher=[[Online Etymology Dictionary]]}}</ref> The Latin-language form of the term first appeared in 1736 when Swedish scientist [[Carl Linnaeus]] (Carl von Linné) used ''biologi'' in his ''Bibliotheca botanica''. It was used again in 1766 in a work entitled ''Philosophiae naturalis sive physicae: tomus III, continens geologian, biologian, phytologian generalis'', by [[Michael Christoph Hanow|Michael Christoph Hanov]], a disciple of [[Christian Wolff (philosopher)|Christian Wolff]]. The first German use, ''Biologie'', was in a 1771 translation of Linnaeus' work. In 1797, Theodor Georg August Roose used the term in the preface of a book, ''Grundzüge der Lehre van der Lebenskraft''. [[Karl Friedrich Burdach]] used the term in 1800 in a more restricted sense of the study of human beings from a morphological, physiological and psychological perspective (''Propädeutik zum Studien der gesammten Heilkunst''). The term came into its modern usage with the six-volume treatise ''Biologie, oder Philosophie der lebenden Natur'' (1802–22) by [[Gottfried Reinhold Treviranus]], who announced:<ref name=Richards>{{cite book|last=Richards|first=Robert J.|title=The Romantic Conception of Life: Science and Philosophy in the Age of Goethe|year=2002|publisher=University of Chicago Press|isbn=0-226-71210-9|url=https://books.google.com/?id=X7N4_i7vrTUC&printsec=frontcover#v=onepage&q&f=false}}</ref>

:The objects of our research will be the different forms and manifestations of life, the conditions and laws under which these phenomena occur, and the causes through which they have been effected. The science that concerns itself with these objects we will indicate by the name biology [Biologie] or the doctrine of life [Lebenslehre].

"""


# Module-level segmenter instance, constructed with default arguments.
segmenter = ParagraphsSentencesAndWhitespace()


def process_sentences(segments):
    """
    Collect the matchable segments nested one level inside `segments`.

    Top-level items that are :class:`MatchableSegment` instances are treated
    as paragraphs; each of their children that is also a
    :class:`MatchableSegment` is treated as a sentence. Everything else
    (e.g. whitespace) is skipped at both levels.

    :Parameters:
        segments : `iterable`
            Top-level segments to scan.

    :Returns:
        A `list` of the sentence segments, in encounter order.
    """
    return [
        inner
        for outer in segments
        if isinstance(outer, MatchableSegment)  # paragraph level
        for inner in outer
        if isinstance(inner, MatchableSegment)  # sentence level
    ]


def my_strip_code(wikicode):
Example #5
0
import cProfile as profile
import random
import time
import pickle

from deltas import segment_matcher, sequence_matcher
from deltas.segmenters import ParagraphsSentencesAndWhitespace
from deltas.tokenizers import wikitext_split, text_split
from mw import api

segmenter = ParagraphsSentencesAndWhitespace()

# Fetch the content of two revisions through the English Wikipedia API.
session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']
common2 = session.revisions.get(638077284, properties={"content"})['*']

common1_tokens = list(wikitext_split.tokenize(common1))
common2_tokens = list(wikitext_split.tokenize(common2))

# Read the word list inside a context manager so the file handle is closed
# as soon as the list has been built.
with open('/usr/share/dict/words') as words_file:
    words = [line.strip() for line in words_file]

# Rebuild the text with every "word" token replaced by a random dictionary
# word; all other tokens are kept verbatim.
# NOTE(review): both random texts are derived from common1_tokens -- confirm
# whether random2 was meant to be derived from common2_tokens instead.
random1 = ''.join(
    random.choice(words) if t.type == "word" else str(t)
    for t in common1_tokens)
random2 = ''.join(
    random.choice(words) if t.type == "word" else str(t)
    for t in common1_tokens)

random2_tokens = list(wikitext_split.tokenize(random2))
random1_tokens = list(wikitext_split.tokenize(random1))

print("Tokenizing:")
Example #6
0

def is_uppercase_word(word_token):
    """
    Report whether `word_token` is a multi-character, fully uppercase word.

    A token qualifies only if it is longer than one character and every
    character differs from its own lowercased form. Characters that have no
    distinct lowercase form (digits, punctuation, already-lowercase letters)
    therefore make the result ``False``, which is stricter than
    ``str.isupper()``.

    :Parameters:
        word_token : `str`
            The token text to check.

    :Returns:
        `bool`
    """
    # all() stops at the first non-uppercase character instead of counting
    # matches over the whole token and comparing the count with its length.
    return len(word_token) > 1 and all(c.lower() != c for c in word_token)


class TokenIsInTypes:
    """
    Predicate object that accepts tokens whose ``type`` attribute belongs to
    a configured collection of types.
    """

    def __init__(self, types):
        # Kept as a set, so repeated entries in `types` collapse.
        allowed = set()
        allowed.update(types)
        self.types = allowed

    def filter(self, token):
        """
        Return whether ``token.type`` is one of the configured types.
        """
        token_type = token.type
        return token_type in self.types


def _process_tokens(text):
    """
    Tokenize `text` with `wikitext_split` and return the tokens as a list.

    A falsy `text` (such as ``None``) is tokenized as the empty string.
    """
    # list() materializes the tokens directly; an element-by-element
    # identity comprehension adds nothing.
    return list(wikitext_split.tokenize(text or ""))


def tokenized(text_datasource, name=None):
    """
    Build a :class:`revision.Datasource` that generates a list of tokens.

    The datasource is constructed with `_process_tokens` as its processing
    function and `text_datasource` as its only dependency.

    :Parameters:
        text_datasource
            The datasource the new one depends on.
        name : `str`
            Name for the new datasource. When ``None``, a name of the form
            ``tokenized(<text_datasource>)`` is generated.
    """
    datasource_name = (
        name if name is not None
        else "{0}({1})".format("tokenized", text_datasource)
    )
    return Datasource(
        datasource_name, _process_tokens, depends_on=[text_datasource])


# Module-level ParagraphsSentencesAndWhitespace instance, constructed with
# default arguments.
paragraphs_sentences_and_whitespace = ParagraphsSentencesAndWhitespace()