from nltk.tokenize import (BlanklineTokenizer, PunktSentenceTokenizer,
                           WhitespaceTokenizer)


def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(
                        current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
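
A minimal usage sketch for the function above (the sample transcript is made
up for illustration): each returned subtitle is a list of at most two lines,
each at most 38 characters long.

transcript = (
    "Hello everyone, thanks for joining the call today.\n"
    "\n"
    "No problem. Shall we start with the quarterly numbers?"
)
for subtitle in tokenize_english_document(transcript):
    print(subtitle)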
Example #2
def read(self, file_path):
    logger.info('Reading instances from file %s', file_path)
    reader = TaggedCorpusReader(*os.path.split(file_path),
                                sep='\t',
                                word_tokenizer=RegexpTokenizer(r'\n',
                                                               gaps=True),
                                sent_tokenizer=BlanklineTokenizer(),
                                para_block_reader=lambda s: [s.read()])
    return Dataset([
        self.text_to_instance(*tuple(zip(*tagged_sent)))
        for tagged_sent in reader.tagged_sents()
    ])
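
The reader configuration above expects one token and tag per line separated
by a tab, blank lines between sentences, and the whole file read as a single
paragraph block. Below is a small self-contained sketch of that format (the
file name and tags are made up), read back with a plain TaggedCorpusReader
configured the same way, since Dataset and text_to_instance belong to the
surrounding class:

import os
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import BlanklineTokenizer, RegexpTokenizer

with open('sample.tsv', 'w', encoding='utf8') as f:
    f.write("The\tDET\nquick\tADJ\nfox\tNOUN\n\nIt\tPRON\nran\tVERB\n")

reader = TaggedCorpusReader(*os.path.split(os.path.abspath('sample.tsv')),
                            sep='\t',
                            word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                            sent_tokenizer=BlanklineTokenizer(),
                            para_block_reader=lambda s: [s.read()])
print(reader.tagged_sents())
# expected: [[('The', 'DET'), ('quick', 'ADJ'), ('fox', 'NOUN')], [('It', 'PRON'), ('ran', 'VERB')]]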
Example #3
def fun_1_1_5():
    import nltk
    from nltk.tokenize import RegexpTokenizer
    from nltk.tokenize import regexp_tokenize
    tokenizer = RegexpTokenizer(r"[\w]+")
    print("RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions"))
    print("regexp_tokenize:", regexp_tokenize(
        "Don't hesitate to ask questions", pattern=r"\w+|\$[\d\.]+|\S+"))
    # Split on whitespace
    tokenizer = RegexpTokenizer(r'\s+', gaps=True)
    print("RegexpTokenizer:", tokenizer.tokenize(
        "Don't hesitate to ask questions"))
    # Select only words starting with a capital letter
    sent = " She secured 90.56 % in class X \n. She is a meritorious student"
    capt = RegexpTokenizer(r'[A-Z]\w+')
    print("RegexpTokenizer:", capt.tokenize(sent))
    # BlanklineTokenizer is a subclass of RegexpTokenizer that uses a
    # predefined regular expression
    from nltk.tokenize import BlanklineTokenizer
    print("BlanklineTokenizer:", BlanklineTokenizer().tokenize(sent))
    # Strings can also be split on spaces, gaps, newlines and so on
    from nltk.tokenize import WhitespaceTokenizer
    print("WhitespaceTokenizer:", WhitespaceTokenizer().tokenize(sent))
    # WordPunctTokenizer splits text with the regular expression \w+|[^\w\s]+,
    # separating it into alphabetic and non-alphabetic characters
    from nltk.tokenize import WordPunctTokenizer
    print("WordPunctTokenizer:", WordPunctTokenizer().tokenize(sent))
    # Splitting with the split() method
    print("split():", sent.split())
    print("split(' '):", sent.split(' '))
    print("split('\n'):", sent.split('\n'))
    # Like sent.split('\n'), LineTokenizer splits the text into lines
    from nltk.tokenize import LineTokenizer
    print("LineTokenizer:", LineTokenizer().tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='keep').tokenize(sent))
    print("LineTokenizer:", LineTokenizer(blanklines='discard').tokenize(sent))
    # SpaceTokenizer works like sent.split(' ')
    from nltk.tokenize import SpaceTokenizer
    print("SpaceTokenizer:", SpaceTokenizer().tokenize(sent))
    # The nltk.tokenize.util module returns the tokenization as a sequence of
    # tuples giving each token's position and offset in the sentence
    print("Token spans:", list(WhitespaceTokenizer().span_tokenize(sent)))
    # Given a sequence of spans, the sequence of relative spans can be returned
    from nltk.tokenize.util import spans_to_relative
    print("Position and offset:", list(
        spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))
    # string_span_tokenize(sent, separator) returns the offsets of the tokens
    # in sent by splitting at each occurrence of the separator
    from nltk.tokenize.util import string_span_tokenize
    print("Token spans:", list(string_span_tokenize(sent, " ")))
Example #4
    def __init__(
        self,
        path: str,
        encoding: str = 'utf8',
        lower: bool = True,
        replace_digits: bool = True,
    ) -> None:
        self.__lower = lower
        self.__replace_digits = replace_digits

        word_tokenizer = RegexpTokenizer(r'\n', gaps=True)
        sent_tokenizer = BlanklineTokenizer()

        def para_block_reader(stream):
            return [stream.read()]

        super().__init__(*os.path.split(path),
                         sep='\t',
                         word_tokenizer=word_tokenizer,
                         sent_tokenizer=sent_tokenizer,
                         para_block_reader=para_block_reader,
                         encoding=encoding)
Example #5
class CorpusReader(TaggedCorpusReader):
    DIGITS = re.compile(r'\d+')
    WORD_TOK = RegexpTokenizer(r'\n', gaps=True)
    SENT_TOK = BlanklineTokenizer()

    def __init__(
        self,
        path: str,
        encoding: str = 'utf8',
        max_sent_len: int = -1,
    ) -> None:
        self.__max_sent_len = max_sent_len

        def para_block_reader(stream):
            return [stream.read()]

        super().__init__(*os.path.split(path),
                         sep='\t',
                         word_tokenizer=self.WORD_TOK,
                         sent_tokenizer=self.SENT_TOK,
                         para_block_reader=para_block_reader,
                         encoding=encoding)

    def paras(self) -> List[List[List[str]]]:
        paras = []
        for para in super().paras():
            sents = []
            for sent in para:
                if self.__max_sent_len != -1 and len(
                        sent) > self.__max_sent_len:
                    continue
                sents.append(sent)
            paras.append(sents)
        return paras

    def sents(self) -> List[List[str]]:
        return list(itertools.chain.from_iterable(self.paras()))

    def words(self) -> List[str]:
        return list(itertools.chain.from_iterable(self.sents()))

    def tagged_paras(self) -> List[List[List[Tuple[str, str]]]]:
        tagged_paras = []
        for tagged_para in super().tagged_paras():
            tagged_sents = []
            for tagged_sent in tagged_para:
                if self.__max_sent_len != -1 and len(
                        tagged_sent) > self.__max_sent_len:
                    continue
                tagged_sents.append(tagged_sent)
            tagged_paras.append(tagged_sents)
        return tagged_paras

    def tagged_sents(self) -> List[List[Tuple[str, str]]]:
        return list(itertools.chain.from_iterable(self.tagged_paras()))

    def tagged_words(self) -> List[Tuple[str, str]]:
        return list(itertools.chain.from_iterable(self.tagged_sents()))

    @classmethod
    def to_sents(cls, text: str) -> List[List[str]]:
        return [[word for word in cls.WORD_TOK.tokenize(sent)]
                for sent in cls.SENT_TOK.tokenize(text)]  # yapf: disable
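
A quick standalone check of the to_sents() helper above (the input string is
made up): sentences are separated by blank lines and tokens by newlines,
mirroring the reader's tokenizers.

raw_text = "The\nquick\nfox\n\nIt\nran"
print(CorpusReader.to_sents(raw_text))
# [['The', 'quick', 'fox'], ['It', 'ran']]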
Example #6
import pickle
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import brill_trainer
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.',
                          r'.*\.sdx',
                          sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

## print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
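
The snippet stops right after computing cutoff. One plausible continuation (a
sketch, not taken from the source; the pickle file name is made up) that uses
cutoff, max_rules and min_score to train a Brill tagger on top of a unigram
baseline:

from nltk.tag import UnigramTagger, brill, brill_trainer

train_sents = tagged_data_list[:cutoff]
eval_sents = tagged_data_list[cutoff:]

# Unigram baseline, then transformation-based (Brill) rules on top of it
unigram_tagger = UnigramTagger(train_sents)
trainer = brill_trainer.BrillTaggerTrainer(unigram_tagger,
                                           brill.fntbl37(),
                                           trace=1)
brill_tagger = trainer.train(train_sents, max_rules=max_rules,
                             min_score=min_score)
print(brill_tagger.accuracy(eval_sents))  # .evaluate() on older NLTK versions

with open('brill_tagger.pickle', 'wb') as f:
    pickle.dump(brill_tagger, f)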
Example #7
def init_line_tokenizer():
    global line_tokenizer
    if line_tokenizer is None:
        line_tokenizer = BlanklineTokenizer()
Example #8
import nltk
sent = " She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import BlanklineTokenizer
print(BlanklineTokenizer().tokenize(sent))
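
A short follow-up (the para string is made up): sent above contains only a
single newline, so BlanklineTokenizer returns it as one block; the split only
happens at a blank line, i.e. two consecutive newlines.

para = "She secured 90.56 % in class X.\n\nShe is a meritorious student"
print(BlanklineTokenizer().tokenize(para))
# ['She secured 90.56 % in class X.', 'She is a meritorious student']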
Example #9
import random

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import UnigramTagger
from nltk.tag import brill
from nltk.tag import brill_trainer
from nltk.tbl import Template
from nltk.tokenize import BlanklineTokenizer

# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|', sent_tokenizer=BlanklineTokenizer(), encoding='ISO-8859-9')

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]

# print "Data is read! "

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)
Example #10
from nltk.tokenize import (RegexpTokenizer, regexp_tokenize,
                           WhitespaceTokenizer, BlanklineTokenizer,
                           LineTokenizer, SpaceTokenizer)
from nltk.tokenize.util import spans_to_relative

# `text` is assumed to be defined earlier in the original script; only
# `text2` is defined in this excerpt.
text2 = "This is a breaking news.\n A godzilla has been discovered in Tokyo city."
tokenizer = RegexpTokenizer(r'[\w]+')

print(tokenizer.tokenize(text))

print(regexp_tokenize(text, pattern=r'\w+|\$[\d\.]+\S+'))

# Tokenize on whitespace
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize(text))

# Select only words starting with capital letters
capt = RegexpTokenizer(r'[A-Z]\w+')
print(capt.tokenize(text2))

print(BlanklineTokenizer().tokenize(text2))

print(WhitespaceTokenizer().tokenize(text2))

print(LineTokenizer(blanklines='keep').tokenize(text2))
print(LineTokenizer(blanklines='discard').tokenize(text2))

# SpaceTokenizer works like .split(' ')
print(SpaceTokenizer().tokenize(text2))

# Returns the sequence of tuples that are offsets of the tokens
# in a sentence:
print(list(WhitespaceTokenizer().span_tokenize(text2)))

# Returns the sequence of relative spans
print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(text2))))
Example #11
# `text` is assumed to be defined, and nltk / RegexpTokenizer /
# BlanklineTokenizer imported, earlier in the original script.

# Example 2
# A simple sentence tokenizer. Uses the regular expression r'\.(\s+|$)',
# which means, look for a period, followed by a space,
# and use that as the delimiter.
tokens = nltk.regexp_tokenize(text, pattern=r'\.(\s+|$)', gaps=True)
# print(*tokens, sep='\n')

# Example 3
# A slightly smarter sentence tokenizer. The regular expression used means
# 'Look for a period, exclamation point, question mark, or semicolon,
# followed by a space, and use that as the delimiter.'
tokens = nltk.regexp_tokenize(text, pattern=r'[\.!?;](\s+|$)', gaps=True)
# print(*tokens, sep='\n')

# Example 4
# Split text with spaces only, preserving punctuation:
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokens = tokenizer.tokenize(text)
# print(*tokens, sep='\n')

# Example 5
# Retrieve all capitalized words using a regular expression
capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
tokens = capword_tokenizer.tokenize(text)
# print(*tokens, sep='\n')

# Example 6
# Chunk text by paragraph breaks
tokens = BlanklineTokenizer().tokenize(text)
# print(tokens)
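
Since text is not defined in this excerpt, here is a tiny self-contained check
(the sample string is made up) of the '[\.!?;]' splitter from Example 3 above:

import nltk

sample = "Wait! Is this real? Yes; it is. Quite surprising"
print(nltk.regexp_tokenize(sample, pattern=r'[\.!?;](\s+|$)', gaps=True))
# ['Wait', 'Is this real', 'Yes', 'it is', 'Quite surprising']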
Example #12
import string

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import BlanklineTokenizer
nltk.download('popular', quiet=True)  # for downloading packages

#Reading in the corpus
with open('Corpus.txt', 'r', encoding='utf8', errors='ignore') as fin:
    raw = fin.read().lower()

#nltk.download('punkt') # first-time use only
#nltk.download('wordnet') # first-time use only
#nltk.download('regexptokenizer')

#Tokenisation
#sent_tokens = nltk.sent_tokenize(raw)# converts to list of sentences
sent_tokens = BlanklineTokenizer().tokenize(
    raw)  # Should convert to text blocks
word_tokens = nltk.word_tokenize(raw)  # converts to list of words

# Preprocessing
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]


remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)


def LemNormalize(text):
    # Lowercase, strip punctuation, word-tokenize, then lemmatize
    return LemTokens(
        nltk.word_tokenize(text.lower().translate(remove_punct_dict)))
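
A quick check of the normalization pipeline above on a made-up sentence:

print(LemNormalize("Dogs are barking loudly!"))
# ['dog', 'are', 'barking', 'loudly']  (default noun lemmatization only changes 'dogs')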
Example #13
import random

import nltk
import requests
from nltk.tokenize import BlanklineTokenizer
from textblob import TextBlob


def tweet_content():
    """Generate tweet string (140 characters or less)
    """

#    with open('basho.txt', 'r') as content_file:
#        content = content_file.read()
    r = requests.get("http://novicevagabond.com/projects/haiku/basho.txt")
    content = r.text  # use the decoded text, not raw bytes
     
    nltk.data.path.append("nltk_data/")
    nltk.data.path.append("nltk_data/punkt")
    nltk.data.path.append("fizzle_dizzle/")
#    nltk.download()

#print content

    tokenizer = BlanklineTokenizer()
    cleaned_content = content.lower()
    corpus = TextBlob(cleaned_content,  tokenizer=tokenizer)

    haiku = corpus.sentences
#print haiku

    bigrams = corpus.ngrams(n=2)
    trigrams = corpus.ngrams(n=3)

#print bigrams
    dict = {}
    for bigram in bigrams:
        k = bigram[0]
        v = bigram[1]
        if k in dict:
            if v in dict[k]:
                dict[k][v] = dict[k][v] + 1
            else:
                dict[k][v] = 1
        else:
            dict[k] = { v : 1}

#print dict

    def weighted_choice(map):
        choices = [] 
        for k in map:
            #print k 
            for n in range(1, map[k] + 1):
                choices.append(k)
        #print choices
        choice = random.choice(choices)
        #print choice
        return choice

    seed = random.choice(list(dict.keys()))
    length = random.randint(11,15) 

    output = [seed]
#print output
    for i in range(length):
        output.append(weighted_choice(dict[output[i]]))

    whitespace = " "
    line1 = whitespace.join(output[0:4])
    line2 = whitespace.join(output[4:9])
    line3 = whitespace.join(output[9:])
    line4 = "-- #markov_basho_haiku"
    sep = "\n"
    tweet = sep.join([line1, line2, line3, line4])
#    print tweet
    return tweet
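
The nested dict built above maps each word to a dict of follower counts, and
weighted_choice expands those counts into a list before sampling. For
reference, the same weighted pick can be made directly with random.choices
(the counts shown are made up):

import random

followers = {'pond': 2, 'temple': 1}  # hypothetical bigram counts for one word
next_word = random.choices(list(followers.keys()),
                           weights=list(followers.values()))[0]
print(next_word)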