Example #1
    def __index_content(url_id, db, soup):
        title = soup.title.text if soup.title is not None else None
        if title is not None and not isinstance(title, unicode):
            # Python 2: normalise a plain byte-string title to unicode
            title = unicode(title, 'utf8')

        content = soup.find("div", {"id": "mw-content-text"}).text

        if not isinstance(content, unicode):
            # Python 2: the sentence tokenizer below expects unicode text
            content = unicode(content, 'utf8')

        # content = soup.text

        custom_tokenizer = PunktSentenceTokenizer()
        tokenized_sentences = custom_tokenizer.tokenize(unicode(content))

        page = dict()
        page["title"] = title
        hints_list = list()
        try:
            for sentence in tokenized_sentences:
                words = nltk.word_tokenize(sentence)
                tagged = nltk.pos_tag(words)

                grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
                {<NNP>+}"""
                chunk_parser = nltk.RegexpParser(grammar)
                chunked = chunk_parser.parse(tagged)
                for chunk in chunked.subtrees():
                    if chunk.label() == "NP":
                        line = list()
                        for each in chunk.leaves():
                            if len(each[0]) > 2:
                                line.append(each[0])
                        if len(line) > 0:
                            final_value = (" ".join(line)).lower()
                            hints_list.append(final_value)
                page["hints"] = hints_list

        except Exception as e:
            print(str(e))
        db.known_urls.update_one({"_id": url_id}, {"$set": {"content": page}})

        page_content_size = len(page["hints"])
        print(colored("\t\tUpdated With Indexed Content", "yellow"))

        # current_dir = os.getcwd()
        # files_dir = current_dir + "/Originals/"
        # file_name = url_id
        # file_path = files_dir + str(file_name)
        # created_file = open(file_path, "w")
        # created_file.write(content.encode("utf-8"))
        # created_file.close()
        # print("\t\tOriginal Content Is Saved")
        return page_content_size
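A minimal standalone sketch of the noun-phrase "hints" extraction above, without the MongoDB and BeautifulSoup plumbing; it assumes NLTK's punkt and averaged_perceptron_tagger data are installed, and the sample text is made up.

import nltk
from nltk.tokenize import PunktSentenceTokenizer

# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')

text = "The quick brown fox jumps over the lazy dog. Python is a popular language."

grammar = r"""NP: {<DT|PP\$>?<JJ>*<NN>}
    {<NNP>+}"""
chunk_parser = nltk.RegexpParser(grammar)

hints = []
for sentence in PunktSentenceTokenizer().tokenize(text):
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    for chunk in chunk_parser.parse(tagged).subtrees():
        if chunk.label() == "NP":
            words = [word for word, tag in chunk.leaves() if len(word) > 2]
            if words:
                hints.append(" ".join(words).lower())

print(hints)  # lowercased noun-phrase hints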
Example #2
def tokenize_english_document(input_text):
    """
    This is a crude tokenizer for input conversations in English.
    :param input_text:
    :return:
    """
    end_list = []
    block_tokenizer = BlanklineTokenizer()
    sentence_tokenizer = PunktSentenceTokenizer()
    word_tokenizer = WhitespaceTokenizer()
    # using the 38 characters in one line rule from ITV subtitle guidelines
    characters_per_line = 38
    lines_per_subtitle = 2

    blocks = block_tokenizer.tokenize(input_text)
    for block in blocks:
        # We have one speaker
        sentences = sentence_tokenizer.tokenize(block)
        # We have the sentences
        for sentence in sentences:
            words = word_tokenizer.tokenize(sentence)
            reverse_words = words[::-1]

            lines = []
            current_line = ''
            line_full = False
            while reverse_words:
                word = reverse_words.pop()
                longer_line = ' '.join([current_line, word]).strip()
                if len(longer_line) > characters_per_line and len(
                        current_line):
                    # The longer line is overreaching boundaries
                    reverse_words.append(word)
                    line_full = True
                elif len(word) >= characters_per_line:
                    # Very long words
                    current_line = longer_line
                    line_full = True
                else:
                    current_line = longer_line

                if line_full:
                    lines.append(current_line)
                    current_line = ''
                    line_full = False

                if len(lines) >= lines_per_subtitle:
                    end_list.append(lines)
                    lines = []
            if current_line:
                lines.append(current_line)
            if lines:
                end_list.append(lines)

    return end_list
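A quick usage sketch with made-up dialogue, assuming the function above and the NLTK tokenizers it uses are in scope; each inner list is one subtitle of at most two 38-character lines.

dialogue = (
    "Thank you so much for coming all this way out here on such short notice. "
    "I really do appreciate it.\n\n"
    "It was no trouble at all."
)

for subtitle in tokenize_english_document(dialogue):
    print(" / ".join(subtitle))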
def pre_segment(doc):
    """Set sentence boundaries with nltk instead of spacy."""
    if len(str(doc.text).split()) > 3:
        tokenizer = PunktSentenceTokenizer(doc.text)
        sentences = tokenizer.tokenize(doc.text)
        for nltk_sentence in sentences:
            words = re.findall(r"[\w]+|[^\s\w]", nltk_sentence)
            for i in range(len(doc) - len(words) + 1):
                token_list = [str(token) for token in doc[i:i + len(words)]]
                if token_list == words:
                    doc[i].is_sent_start = True
                    for token in doc[i + 1:i + len(words)]:
                        token.is_sent_start = False
    return doc
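A sketch of plugging pre_segment into a spaCy pipeline so the Punkt boundaries are set before parsing. It assumes the spaCy 2.x add_pipe API (in spaCy 3.x the function would first need to be registered with @Language.component), that the en_core_web_sm model is installed, and that pre_segment's own imports (re, PunktSentenceTokenizer) are in scope.

import spacy

nlp = spacy.load("en_core_web_sm")
# run the Punkt-based boundary setter before the dependency parser
nlp.add_pipe(pre_segment, name="punkt_sentencizer", before="parser")

doc = nlp("This is the first sentence. Here is another one for good measure.")
for sent in doc.sents:
    print(sent.text)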
Example #4
def sentence_tokenizer(text):
    """
    Tokenizes sentences.

    :param text:
    :return: list of sentences (a sentence is a string)
    """
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = {
        'zzgl', 'prof', 'ca', 'vj', 't', 'mio', 'sro', 'lv', 'io', 'ihv',
        'bzw', 'usw', 'inkl', 'zt', 'vh', 'dr', 'entspr', 'dem', 'fort', 'co',
        'kg', 'zb', 'bspw', 'ua', 'rd', 'abs', 'etc', 'tsd', 'z.b', 'evtl',
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '19', '20', '21'
    }
    sentence_splitter = PunktSentenceTokenizer(punkt_param)
    return sentence_splitter.tokenize(text)
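A short usage example: with the abbreviation list above (entries are lowercase and without the trailing dot), Punkt should not split after "ca.", "bzw." or "Dr." in this made-up German text.

text = ("Die Kosten betragen ca. 120 Euro bzw. etwas mehr, inkl. Versand. "
        "Dr. Meier bestätigt das.")
for sentence in sentence_tokenizer(text):
    print(sentence)
# Die Kosten betragen ca. 120 Euro bzw. etwas mehr, inkl. Versand.
# Dr. Meier bestätigt das.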
Example #5
def _extract_text_from_elements(elements: Element, punkt: bool,
                                keep_xml: bool) -> List[str]:
    examples = []
    if keep_xml:
        for e in elements:
            xml_str = tostring(e).decode('utf-8')  # tostring returns bytes
            length = len(innertext(e))
            if length > config.min_char_length:
                examples.append(xml_str)
    else:
        for e in elements:
            text = innertext(e)
            if punkt:
                sentences = PunktSentenceTokenizer().tokenize(text=text)
                filtered_sentences = [s for s in sentences if _filter(s)]
                examples += filtered_sentences
            else:
                if _filter(text):
                    examples.append(text)
    return examples
Example #6
def sentence_split(input_text):
    input_text = "<root>" + input_text + "</root>"

    soup = BeautifulSoup(input_text, "xml")
    paragraphs = []
    for doc in soup.find('root').findAll('DOC'):
        if doc['type'] == 'story':
            headlines = doc('HEADLINE')
            for h in headlines:
                paragraphs.append(h.contents[0])
            p_blocks = doc.find('TEXT').findAll('P')
            for p in p_blocks:
                paragraphs.append(p.contents[0])
        elif doc['type'] == 'multi':
            paragraphs.append(doc.find('TEXT').contents[0])

    sentences = []
    punkt = PunktSentenceTokenizer()
    for parag in paragraphs:
        for sent in punkt.sentences_from_text(parag, realign_boundaries=True):
            sentences.append(replace.sub(' ', sent).strip())
    return sentences
Example #7
def sent_tokenize(text):
    model_path = join(dirname(__file__), 'sent_tokenize_model_v1.0.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…', 'ts',
        'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i', 'j.f', 'r.r',
        'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…', 'ph', 'j.k',
        'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    sentences = tokenizer.sentences_from_text(text)
    return sentences
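If the pickled model file is not available, a rough fallback is to fill a fresh PunktParameters object with the same kind of abbreviation list; this loses whatever collocation and orthography statistics the pickled model carries, so the output will differ.

import string
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

punkt_param = PunktParameters()
for abbrev in ['e.g', 'dr', 'mr', 'mrs', 'prof', 'inc'] + list(string.ascii_lowercase):
    punkt_param.abbrev_types.add(abbrev)

fallback_tokenizer = PunktSentenceTokenizer(punkt_param)
print(fallback_tokenizer.sentences_from_text("Mr. Smith met Dr. Jones. They talked."))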
def _load_model():
    global sentence_tokenizer
    if sentence_tokenizer is not None:
        return
    model_path = join(dirname(__file__), 'st_kiss-strunk-2006_2019_01_13.pkl')
    with open(model_path, 'rb') as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        'g.m.t', 'e.g', 'dr', 'dr', 'vs', "000", 'mr', 'mrs', 'prof', 'inc',
        'tp', 'ts', 'ths', 'th', 'vs', 'tp', 'k.l', 'a.w.a.k.e', 't', 'a.i',
        '</i', 'g.w', 'ass', 'u.n.c.l.e', 't.e.s.t', 'ths', 'd.c', 've…', 'ts',
        'f.t', 'b.b', 'z.e', 's.g', 'm.p', 'g.u.y', 'l.c', 'g.i', 'j.f', 'r.r',
        'v.i', 'm.h', 'a.s', 'bs', 'c.k', 'aug', 't.d.q', 'b…', 'ph', 'j.k',
        'e.l', 'o.t', 's.a'
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)
Example #9
for line in lines:
    line_dict = dict(line)
    if "EKYWD" in line_dict and "EABST" in line_dict:
        keywords = re.split("\t", line_dict["EKYWD"])
        all_keywords.update(set(keywords))

sys.stderr.write("Tokenize keywords\n")

keywords = []

for keyword in all_keywords:
    keywords.append(word_tokenize(keyword))

sys.stderr.write("All keywords: " + str(len(all_keywords)) + "\n")

p = PunktSentenceTokenizer()

for i in range(len(lines)):
    if i % 10 == 0:
        sys.stderr.write(str(i) + "/" + str(len(lines)) + "\n")
    line = dict(lines[i])
    if not ("EKYWD" in line and "EABST" in line):
        continue
    abstract = line["EABST"]
    abstract = p.tokenize(abstract)
    abstract = [word_tokenize(sentence) for sentence in abstract]
    for sentence in abstract:
        j = 0
        while j < len(sentence):
            found = False
            for k in range(len(keywords)):
Example #10
    def tokenize_to_sentences(self, paragraph):
        tokenizer = PunktSentenceTokenizer()
        sentences = tokenizer.tokenize(paragraph)
        return sentences
Example #11
def get_sentence_tokenizer():
    # https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    return PunktSentenceTokenizer()
Example #12
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
import nltk

example = "Hello Mr. Holmes. How are you doing? The weather is nice Holmes and Python is amazing. I hope you like it too!"
sen_list = sent_tokenize(example)
sen = sen_list[2]
print(sen)
stop_words = set(stopwords.words('english'))
'''words = word_tokenize(sen)
filtered_words = []
for w in words:
    if w not in stop_words:
        filtered_words.append(w)
print(filtered_words)
'''
# Tokenizing
tokenize = PunktSentenceTokenizer(sen)
tokenized = tokenize.tokenize(sen)
print(tokenized)
for i in tokenized:
    # Speech (POS) tagging
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    # Chunking
    '''
    In the tag patterns below, "." matches any single character and "?" makes
    the preceding element optional (zero or one occurrence); see the tutorial
    on pythonprogramming.net for more detail.
    '''
    # RB, VB, NNP etc. are POS tags (e.g. VB = verb); the grammar chunks together
    # optional adverbs and verbs followed by a proper noun and an optional noun.
    chunkgram = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?} """
    chunkparser = nltk.RegexpParser(chunkgram)
    chunked = chunkparser.parse(tagged)
    print(chunked)
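    # (sketch) instead of printing the whole parse tree, the matched "Chunk"
    # phrases can be pulled out by filtering the subtrees on their label
    for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
        print(" ".join(word for word, tag in subtree.leaves()))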
Example #13
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000

sentTokenizer = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenizer.tokenize(string))

vocab = Counter(words).most_common(vocab_size)
# most_common() returns (word, count) pairs; keep just the words for the membership test below
vocab_words = {word for word, _ in vocab}
sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]

new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        write = True
        for word in sentence:
            if word in vocab_words:
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
Example #14
    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()
# Representing the words with their parts of speech
import nltk
from nltk.corpus import state_union
''' PunktSentenceTokenizer is an unsupervised ML sentence tokenizer.
It ships pretrained, and it can also be trained further on our own text. '''
from nltk import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
text = state_union.raw("2006-GWBush.txt")
SentenceTokenizer = PunktSentenceTokenizer(train)

tokenized = SentenceTokenizer.tokenize(text)


def process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))


process()
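As the comment above says, Punkt can be trained further. A hedged sketch of doing that explicitly with PunktTrainer (rather than passing raw text to the constructor), where more_training_text stands for any additional raw text you might supply:

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

trainer = PunktTrainer()
trainer.train(train, finalize=False)               # the 2005 speech loaded above
trainer.train(more_training_text, finalize=False)  # hypothetical extra corpus
trainer.finalize_training()

further_trained = PunktSentenceTokenizer(trainer.get_params())
print(len(further_trained.tokenize(text)))         # sentence count for the 2006 speech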
    def __init__(self, stop_words, vector_len=1000):
        self.vocab = []
        self.stop_words = stop_words
        self.vector_len = vector_len

        self.tokenizer = PunktSentenceTokenizer()
Example #17
class McChineseTokenizer(object):
    """Chinese language tokenizer that uses jieba."""

    # Path to jieba dictionary(ies)
    __dict_path = os.path.join(mc_root_path(),
                               'lib/MediaWords/Languages/resources/zh/')
    __jieba_dict_path = os.path.join(__dict_path, 'dict.txt.big')
    __jieba_userdict_path = os.path.join(__dict_path, 'userdict.txt')

    # jieba instance
    __jieba = None

    # Text -> sentence tokenizer for Chinese text
    __chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Chinese (e.g. English) text
    __non_chinese_sentence_tokenizer = PunktSentenceTokenizer()

    def __init__(self):
        """Initialize jieba tokenizer."""

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__dict_path):
            raise McChineseTokenizerException("""
                jieba dictionary directory was not found: %s
                Maybe you forgot to initialize Git submodules?
                """ % self.__dict_path)

        if not os.path.isfile(self.__jieba_dict_path):
            raise McChineseTokenizerException("""
                Default dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        if not os.path.isfile(self.__jieba_userdict_path):
            raise McChineseTokenizerException("""
                User dictionary not found in jieba dictionary directory: %s
                Maybe you forgot to run jieba installation script?
                """ % self.__dict_path)
        try:
            # loading dictionary is part of the init process
            self.__jieba.set_dictionary(os.path.join(self.__jieba_dict_path))
            self.__jieba.load_userdict(os.path.join(
                self.__jieba_userdict_path))
        except Exception as ex:
            raise McChineseTokenizerException(
                "Unable to initialize jieba: %s" % str(ex))

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__non_chinese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Chinese sentence into words.
        
        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass
        return words
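Stripped of the MediaWords plumbing, the two-stage sentence splitting used here (and by the Japanese tokenizer in Example #21 below) can be sketched roughly as: a regex tokenizer cuts on CJK sentence-final punctuation, then Punkt handles any embedded non-Chinese text. The sample string is made up.

import re
from nltk.tokenize import PunktSentenceTokenizer, RegexpTokenizer

cjk_splitter = RegexpTokenizer(r'([^!?。]*[!?。])', gaps=True, discard_empty=True)
latin_splitter = PunktSentenceTokenizer()

def split_sentences(text):
    sentences = []
    for chunk in cjk_splitter.tokenize(text.strip()):
        for paragraph in re.split(r"\n\s*?\n", chunk):
            sentences += latin_splitter.tokenize(paragraph)
    return [s.strip() for s in sentences if s.strip()]

print(split_sentences("你好。今天天气很好。This is English. It is fine."))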
Example #18
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 11 12:49:39 2020

@author: alex.a.murray
"""
import nltk 
from nltk.corpus import state_union
from nltk import PunktSentenceTokenizer


train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()
Example #19
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

# example_text is assumed to be defined earlier in the original script
words = word_tokenize(example_text)
sentences = sent_tokenize(example_text)

for w in words:
    print(w)

print()

for s in sentences:
    print(s)

print()

# Using PunktSentenceTokenizer and training it
train_text = state_union.raw("2005-GWBush.txt")

custom_sentence_tokenizer_trained = PunktSentenceTokenizer(train_text)

sentences = custom_sentence_tokenizer_trained.tokenize(example_text)

for s in sentences:
    print(s)

print()

# Using PunktSentenceTokenizer with no training (it comes pretrained)
custom_sentence_tokenizer_untrained = PunktSentenceTokenizer()

sentences = custom_sentence_tokenizer_untrained.tokenize(example_text)

for s in sentences:
    print(s)
Example #20
    def __init__(self):
        self.parser = CoreNLPDependencyParser(url=self.corenlp_server())
        self.sentence_tokenizer = PunktSentenceTokenizer()
Example #21
class McJapaneseTokenizer(object):
    """Japanese language tokenizer that uses MeCab."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [

        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    # MeCab instance
    __mecab = None

    # Text -> sentence tokenizer for Japanese text
    __japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    # Text -> sentence tokenizer for non-Japanese (e.g. English) text
    __non_japanese_sentence_tokenizer = PunktSentenceTokenizer()

    __MECAB_TOKEN_POS_SEPARATOR = random_string(
        length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    def __init__(self):
        """Initialize MeCab tokenizer."""

        mecab_dictionary_path = McJapaneseTokenizer._mecab_ipadic_neologd_path(
        )

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McJapaneseTokenizerException(
                "Unable to initialize MeCab: %s" % str(ex))

    @staticmethod
    def _mecab_ipadic_neologd_path(
    ) -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = McJapaneseTokenizer.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McJapaneseTokenizerException(
                "mecab-ipadic-neologd was not found in paths: %s" %
                str(candidate_paths))

        return mecab_dictionary_path

    def tokenize_text_to_sentences(self, text: str) -> list:
        """Tokenize Japanese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)

        if text is None:
            log.warning("Text to tokenize into sentences is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__non_japanese_sentence_tokenizer.tokenize(
                        list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.
        
        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, family name
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun, place name (general)
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def tokenize_sentence_to_words(self, sentence: str) -> list:
        """Tokenize Japanese sentence into words.
        
        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
Example #22
PATTERN = re.compile(
    r'''(?x)   # set flag to allow verbose regexps: ignores spaces and newlines
        (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
        | '(?:s|nt)\b           # 's, 'nt
        #     | \w+(?:-\w+)*    # words with optional internal hyphens
        | [a-zA-Z0-9]+          # words without internal hyphens e.g. Type1_gene(Attention: \w contains _)
        | \.\.\.                # ellipsis
        | [.,;"?:]              # these are separate tokens
        | [][()_`\|\n-]+        # these tokens are grouped; includes ] and [ and -
        | [^a-zA-Z0-9_\s]       # other single characters found in the AGAC training data
        ''')

# default tokenizer
word_tokenizer = RegexpTokenizer(PATTERN)
sent_tokenizer = PunktSentenceTokenizer()
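A quick check of what the default tokenizers above produce on a made-up sentence, assuming PATTERN, word_tokenizer and sent_tokenizer from this snippet are in scope:

sample = "The U.S.A. spent $12.40 (82%) on Type1_gene research... It is done."
print(word_tokenizer.tokenize(sample))
print(sent_tokenizer.tokenize(sample))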


class Text(object):
    """
    Abstract text, e.g. a paragraph, an abstract or an essay.
    """
    def __init__(self,
                 text,
                 s_tokenizer=sent_tokenizer,
                 w_tokenizer=word_tokenizer):
        """
        :param text: str
        :param s_tokenizer: sentence tokenizer
        :param w_tokenizer: word_tokenizer
        """
Example #23
def _load_model():
    global sentence_tokenizer
    if sentence_tokenizer is not None:
        return
    model_path = join(dirname(__file__), "st_kiss-strunk-2006_2019_01_13.pkl")
    with open(model_path, "rb") as fs:
        punkt_param = pickle.load(fs)

    punkt_param.sent_starters = {}
    abbrev_types = [
        "g.m.t",
        "e.g",
        "dr",
        "dr",
        "vs",
        "000",
        "mr",
        "mrs",
        "prof",
        "inc",
        "tp",
        "tp." "ts",
        "ths",
        "th",
        "vs",
        "tp",
        "k.l",
        "a.w.a.k.e",
        "t",
        "a.i",
        "</i",
        "g.w",
        "ass",
        "u.n.c.l.e",
        "t.e.s.t",
        "ths",
        "d.c",
        "ve…",
        "ts",
        "f.t",
        "b.b",
        "z.e",
        "s.g",
        "m.p",
        "g.u.y",
        "l.c",
        "g.i",
        "j.f",
        "r.r",
        "v.i",
        "m.h",
        "a.s",
        "bs",
        "c.k",
        "aug",
        "t.d.q",
        "b…",
        "ph",
        "j.k",
        "e.l",
        "o.t",
        "s.a",
    ]
    abbrev_types.extend(string.ascii_uppercase)
    for abbrev_type in abbrev_types:
        punkt_param.abbrev_types.add(abbrev_type)
    for abbrev_type in string.ascii_lowercase:
        punkt_param.abbrev_types.add(abbrev_type)
    sentence_tokenizer = PunktSentenceTokenizer(punkt_param)