Example #1
 def test_split_regular(self):
     result = split_contractions(["We'll", 'see', "her's", 'too', '!'])
     self.assertEqual(7, len(result), str(result))
     self.assertEqual(result[0], 'We', str(result))
     self.assertEqual(result[1], "'ll", str(result))
     self.assertEqual(result[3], 'her', str(result))
     self.assertEqual(result[4], "'s", str(result))
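Taken together, the assertions above pin down only positions 0, 1, 3 and 4 plus the total length; the remaining positions are inferred in the minimal sketch below, so treat the full expected list as an assumption rather than something the test itself states.

from segtok.tokenizer import split_contractions

result = split_contractions(["We'll", 'see', "her's", 'too', '!'])
# Indices 0, 1, 3 and 4 and the total length of 7 come from the test above;
# indices 2, 5 and 6 are assumed to pass through unchanged.
assert result == ['We', "'ll", 'see', 'her', "'s", 'too', '!']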
Example #2
 def __build_graph__(self):
     stopwords = get_stopwords(self.lan)
     stem = get_stem(self.lan).stem
     self.G = nx.Graph()
     sentences_str = [[
         w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0
     ] for s in list(split_multi(self.text)) if len(s.strip()) > 0]
     for sentence in sentences_str:
         buffer = []
         for word in sentence:
             if len([
                     c for c in word if c in EXCLUDE
             ]) == len(word) or word.lower() in stopwords or word.replace(
                     '.', '').replace(',', '').replace('-', '').isnumeric():
                 continue
             else:
                 #stemmed_word = lemma(word).lower()
                 stemmed_word = stem(word)
                 if stemmed_word not in self.G:
                     self.G.add_node(stemmed_word, TF=0)
                 self.G.node[stemmed_word]['TF'] += 1
                 for (idx_cooccur,
                      word_cooccur) in enumerate(buffer[-self.w:]):
                     self.__add_cooccur__(word_cooccur, stemmed_word,
                                          idx_cooccur + 1)
                 buffer.append(stemmed_word)
     self.__build_linegraph__()
Example #4
    def add_document(self, text):
        text = self.pre_filter(text)
        sentences_str = [[
            w for w in split_contractions(web_tokenizer(s))
            if not (w.startswith("'") and len(w) > 1) and len(w) > 0
        ] for s in list(split_multi(text)) if len(s.strip()) > 0]
        self.number_of_sentences += len(sentences_str)
        self.number_of_documents += 1
        pos_text = 0
        document_candidates = {}
        term_in_doc = {}
        sentences_obj = []
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([
                        c for c in word if c in self.exclude
                ]) == len(word):  # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append(block_of_word_obj)
                        cand = ComposedWord(block_of_word_obj)
                        cand = self.add_or_update_composed_word(cand)
                        if cand.unique_kw not in document_candidates:
                            document_candidates[cand.unique_kw] = cand
                        block_of_word_obj = []
                else:
                    tag = self.get_tag(word, pos_sent)
                    term_obj = self.get_term(word)
                    term_in_doc[term_obj.unique_term] = term_obj
                    term_obj.add_occurrence(tag, sentence_id, pos_sent,
                                            pos_text, self.number_of_documents)
                    pos_text += 1
                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(
                            range(
                                max(0,
                                    len(block_of_word_obj) - self.windowsSize),
                                len(block_of_word_obj)))
                        for w in word_windows:
                            if block_of_word_obj[w][
                                    0] not in self.tagsToDiscard:
                                self.add_cooccurrence(block_of_word_obj[w][2],
                                                      term_obj)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append((tag, word, term_obj))
            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append(block_of_word_obj)
            if len(sentence_obj_aux) > 0:
                sentences_obj.append(sentence_obj_aux)
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
        self.number_of_words += pos_text
        return document_candidates, term_in_doc
Example #5
    def __init__(self,
                 text: str = None,
                 use_tokenizer: bool = False,
                 labels: List[str] = None):

        self.tokens: List[Token] = []

        self.labels: List[str] = labels

        self._embeddings: Dict = {}

        # optionally, directly instantiate with sentence tokens
        if text is not None:

            # tokenize the text first if option selected, otherwise assumes whitespace tokenized text
            if use_tokenizer:
                sentences = split_single(text)
                tokens = []
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                text = ' '.join(tokens)

            # add each word in tokenized string as Token object to Sentence
            for word in text.split(' '):
                self.add_token(Token(word))
Example #6
    def _build(self, text, windowsSize, n):
        text = self.pre_filter(text)
        self.sentences_str = [ [w for w in split_contractions(web_tokenizer(s)) if not (w.startswith("'") and len(w) > 1) and len(w) > 0] for s in list(split_multi(text)) if len(s.strip()) > 0]
        self.number_of_sentences = len(self.sentences_str)
        pos_text = 0
        block_of_word_obj = []
        sentence_obj_aux = []
        for (sentence_id, sentence) in enumerate(self.sentences_str):
            sentence_obj_aux = []
            block_of_word_obj = []
            for (pos_sent, word) in enumerate(sentence):
                if len([c for c in word if c in self.exclude]) == len(word): # If the word is based on exclude chars
                    if len(block_of_word_obj) > 0:
                        sentence_obj_aux.append( block_of_word_obj )
                        block_of_word_obj = []
                else:
                    tag = self.getTag(word, pos_sent)
                    term_obj = self.getTerm(word)
                    term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                    pos_text += 1

                    #Create co-occurrence matrix
                    if tag not in self.tagsToDiscard:
                        word_windows = list(range( max(0, len(block_of_word_obj)-windowsSize), len(block_of_word_obj) ))
                        for w in word_windows:
                            if block_of_word_obj[w][0] not in self.tagsToDiscard: 
                                self.addCooccur(block_of_word_obj[w][2], term_obj)
                    #Generate candidate keyphrase list
                    candidate = [ (tag, word, term_obj) ]
                    cand = composed_word(candidate)
                    self.addOrUpdateComposedWord(cand)
                    word_windows = list(range( max(0, len(block_of_word_obj)-(n-1)), len(block_of_word_obj) ))[::-1]
                    for w in word_windows:
                        candidate.append(block_of_word_obj[w])
                        self.freq_ns[len(candidate)] += 1.
                        cand = composed_word(candidate[::-1])
                        self.addOrUpdateComposedWord(cand)

                    # Add term to the block of words' buffer
                    block_of_word_obj.append( (tag, word, term_obj) )

            if len(block_of_word_obj) > 0:
                sentence_obj_aux.append( block_of_word_obj )

            if len(sentence_obj_aux) > 0:
                self.sentences_obj.append(sentence_obj_aux)

        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append( block_of_word_obj )

        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)

        self.number_of_words = pos_text
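Both _build above and add_document in Example #4 pick co-occurrence partners from the last windowsSize entries of the running word buffer. Below is a minimal, self-contained sketch of just that index computation; the window size of 3 and the function name window_indices are arbitrary assumptions for illustration.

def window_indices(buffer_len, windows_size):
    # Indices of the last `windows_size` items already in the buffer, mirroring
    # range(max(0, len(block_of_word_obj) - windowsSize), len(block_of_word_obj)).
    return list(range(max(0, buffer_len - windows_size), buffer_len))

print(window_indices(1, 3))  # [0]
print(window_indices(5, 3))  # [2, 3, 4]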
Example #7
def tokenize(text):
    """
    Inputs: text
    Outputs: tokens tokenized by segtok.tokenizer
    """
    tokens = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        tokens.extend(contractions)
    return tokens
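For reference, calling the helper above ties the pieces together: split_single breaks the text into sentences, word_tokenizer separates words and punctuation, and split_contractions splits off clitics. The expected output shown is an assumption based on the unit tests elsewhere on this page, not something guaranteed by this snippet.

# Assumes segtok is installed and refers to the tokenize() helper defined above.
tokens = tokenize("We'll see her, too!")
print(tokens)
# Expected (an assumption): ['We', "'ll", 'see', 'her', ',', 'too', '!']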
Example #8
    def run_tokenize(text: str) -> List[str]:
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        return words
Example #9
    def __init__(self,
                 text: str = None,
                 use_tokenizer: bool = False,
                 labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    token = Token(word)
                    self.add_token(token)
                    try:
                        word_offset = index(word, running_offset)
                    except:
                        word_offset = last_word_offset + 1
                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False
                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # add each word in tokenized string as Token object to Sentence
                for word in text.split(' '):
                    if word:
                        token = Token(word)
                        self.add_token(token)
Example #10
 def __init__(self, text=None, use_tokenizer=False, labels=None):
     super(Sentence, self).__init__()
     self.tokens = []
     self.labels = []
     if (labels is not None):
         self.add_labels(labels)
     self._embeddings = {}
     if (text is not None):
         if use_tokenizer:
             tokens = []
             sentences = split_single(text)
             for sentence in sentences:
                 contractions = split_contractions(word_tokenizer(sentence))
                 tokens.extend(contractions)
             index = text.index
             running_offset = 0
             last_word_offset = (-1)
             last_token = None
             for word in tokens:
                 try:
                     word_offset = index(word, running_offset)
                     start_position = word_offset
                 except:
                     word_offset = (last_word_offset + 1)
                     start_position = ((running_offset + 1) if
                                       (running_offset > 0) else
                                       running_offset)
                 token = Token(word, start_position=start_position)
                 self.add_token(token)
                 if (((word_offset - 1) == last_word_offset)
                         and (last_token is not None)):
                     last_token.whitespace_after = False
                 word_len = len(word)
                 running_offset = (word_offset + word_len)
                 last_word_offset = (running_offset - 1)
                 last_token = token
         else:
             word = u''
             for (index, char) in enumerate(text):
                 if (char == u' '):
                     if (len(word) > 0):
                         token = Token(word,
                                       start_position=(index - len(word)))
                         self.add_token(token)
                     word = u''
                 else:
                     word += char
             index += 1
             if (len(word) > 0):
                 token = Token(word, start_position=(index - len(word)))
                 self.add_token(token)
Example #11
    def word_tokenize(self, text):
        """Get list of string tokens from input string.

        Args:
            text: input string for tokenization
        Yields:
            token: str, non-whitespace tokens
        """
        for token in split_possessive_markers(split_contractions(_html_tokenize(text))):
            if self._max_characters_per_token is not None:
                for token_chunk in funcy.chunks(self._max_characters_per_token, token):
                    yield token_chunk
            else:
                yield token
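The chunking branch above caps token length with funcy.chunks. A minimal sketch of that call in isolation follows; the 5-character limit and the sample word are arbitrary assumptions.

import funcy

# funcy.chunks(n, seq) lazily yields consecutive slices of length n (the last
# slice may be shorter), so an over-long token is emitted as fixed-size pieces.
print(list(funcy.chunks(5, 'antidisestablishmentarianism')))
# ['antid', 'isest', 'ablis', 'hment', 'arian', 'ism']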
Example #12
 def build_candidate(self, candidate_string):
     sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower())) if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
     candidate_terms = []
     for (i, word) in enumerate(sentences_str):
         tag = self.getTag(word, i)
         term_obj = self.getTerm(word, save_non_seen=False)
         if term_obj.tf == 0:
             term_obj = None
         candidate_terms.append( (tag, word, term_obj) )
     if len([cand for cand in candidate_terms if cand[2] != None]) == 0:
         invalid_virtual_cand = composed_word(None)
         return invalid_virtual_cand
     virtual_cand = composed_word(candidate_terms)
     return virtual_cand
Example #13
def fr_tokenizer(text: str) -> list:
    """
    Tokenizes texts in French
    Args:
        text (str): input text

    Returns: 
        flair Token objects
    """
    tokens = []
    tokenizer = RegexpTokenizer(r"""\w'|\w’|\w`|\w\w+'\w+|[^\w\s]|\w+""")
    words = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(tokenizer.tokenize(sentence))
        words.extend(contractions)

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except ValueError:
            word_offset = previous_word_offset + 1
            start_position = (current_offset +
                              1 if current_offset > 0 else current_offset)

        if word:
            token = Token(text=word,
                          start_position=start_position,
                          whitespace_after=True)
            tokens.append(token)

        if (previous_token
                is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
Example #14
def segtok_tokenizer(text: str) -> List[Token]:
    """
    Tokenizer using segtok, a third-party library for rule-based tokenization of Indo-European languages.
    https://github.com/fnl/segtok
    """
    tokens: List[Token] = []

    words: List[str] = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)

    words = list(filter(None, words))

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        #try:
        word_offset = index(word, current_offset)
        start_position = word_offset
        #except:
        #    word_offset = previous_word_offset + 1
        #    start_position = (
        #        current_offset + 1 if current_offset > 0 else current_offset
        #    )

        if word:
            token = Token(text=word,
                          start_position=start_position,
                          whitespace_after=True)
            tokens.append(token)

        if (previous_token
                is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
Example #15
    def run_tokenize(text: str) -> List[Token]:
        tokens: List[Token] = []
        words: List[str] = []

        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            words.extend(contractions)

        words = list(filter(None, words))

        # determine offsets for whitespace_after field
        index = text.index
        current_offset = 0
        previous_word_offset = -1
        previous_token = None
        for word in words:
            try:
                word_offset = index(word, current_offset)
                start_position = word_offset
            except:
                word_offset = previous_word_offset + 1
                start_position = (current_offset +
                                  1 if current_offset > 0 else current_offset)

            if word:
                token = Token(text=word,
                              start_position=start_position,
                              whitespace_after=True)
                tokens.append(token)

            if (previous_token
                    is not None) and word_offset - 1 == previous_word_offset:
                previous_token.whitespace_after = False

            current_offset = word_offset + len(word)
            previous_word_offset = current_offset - 1
            previous_token = token

        return tokens
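The offset bookkeeping above is what drives the whitespace_after flag. Below is a minimal, self-contained sketch of that logic using a stand-in DemoToken class (an assumption for illustration; the real examples use flair's Token), showing that a contraction split such as do / n't leaves the first token marked as having no trailing whitespace.

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class DemoToken:  # stand-in for flair's Token, for illustration only
    text: str
    start_position: int
    whitespace_after: bool = True

def mark_whitespace(text: str, words: List[str]) -> List[DemoToken]:
    tokens: List[DemoToken] = []
    current_offset = 0
    previous_word_offset = -1
    previous_token: Optional[DemoToken] = None
    for word in words:
        word_offset = text.index(word, current_offset)
        token = DemoToken(word, word_offset)
        tokens.append(token)
        # A word that starts immediately after the previous word's last
        # character had no whitespace before it.
        if previous_token is not None and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False
        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token
    return tokens

print([(t.text, t.whitespace_after)
       for t in mark_whitespace("I don't know", ["I", "do", "n't", "know"])])
# [('I', True), ('do', False), ("n't", True), ('know', True)]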
Example #16
 def test_split_not(self):
     stem, contraction = split_contractions(["don't"])
     self.assertEqual(stem, 'do')
     self.assertEqual(contraction, "n't")
Example #17
 def _process_internal(self, sentences: List[str]) -> List[List[str]]:
     return [split_contractions(word_tokenizer(sen)) for sen in sentences]
Example #18
    def word_tokenizer(self, text) -> List[Token]:
        tokenized = []
        if self.language_type == 'zh':
            if self.sp_op == 'char':
                for index, char in enumerate(text):
                    token = Token(char, start_position=index)
                    tokenized.append(token)
            elif self.sp_op == 'py':
                for index, char in enumerate(text):
                    token = Token(char, start_position=index, sp='py')
                    tokenized.append(token)
            else:
                seg_list = list(jieba.tokenize(text))
                for t in seg_list:
                    token = Token(t[0], start_position=t[1])
                    tokenized.append(token)

        elif self.language_type == 'ug':
            text = self.uy_preprocess(text)
            word = ''
            for index, char in enumerate(text):
                if char == ' ':
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word), sp=self.sp_op)
                        tokenized.append(token)

                    word = ''
                else:
                    word += char
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word), sp=self.sp_op)
                tokenized.append(token)

        else:
            tokenized = []
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)

            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset

                token = Token(word, start_position=start_position)
                tokenized.append(token)

                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False

                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token

        return tokenized
Example #19
def question_to_tokenized_fields(question):
    b = ['¡ Description']
    a = question.replace('¶ ¶ Examples ¶ ', '¦¶ ¶ Examples ¶ ¶ ').replace(
        '¶ Examples ¶ ', '¦¶ ¶ Examples ¶ ¶ ').split('¦')

    #You replace Note with Explanation in Codeforces

    #codeforces
    if len(a) > 1:
        for idx, i in enumerate(a):
            if idx == 0:
                c = []
                c += [
                    i.encode('utf-8') for i in segtok.segmenter.split_multi(
                        a[idx].decode('utf-8'))
                ]
                for i in c:
                    b += i.replace(
                        '¶ ¶ Description ¶ ', '¡ Description¦').replace(
                            '¶ ¶ Input ¶ ', '¦¡ Input¦').replace(
                                '¶ ¶ Output ¶ ', '¦¡ Output¦').replace(
                                    '¶ Input ¶ ', '¦¡ Input¦').replace(
                                        '¶ Output ¶ ', '¦¡ Output¦').replace(
                                            ' . ',
                                            ' .¦').replace('¶ ¶ ',
                                                           '¦').split('¦')
            else:
                c = []
                c += [
                    i.encode('utf-8') for i in segtok.segmenter.split_multi(
                        a[idx].decode('utf-8'))
                ]
                for i in c:
                    b += i.replace('¶ ¶ Input ¶ ', '¦¶ ¶ Input ¶ ').replace(
                        '¶ ¶ Examples ', '¡ Examples').replace(
                            '¶ Examples ', '¡ Examples').replace(
                                '¶ ¶ Output ¶ ', '¦¶ Output ¶ ').replace(
                                    '¶ ¶ Note ¶ ', '¦¡ Explanation¦').replace(
                                        '¶ ¶ Input : ¶', '¦¡ Input¦').replace(
                                            '¶ ¶ Output : ¶',
                                            '¦¡ Output¦').replace(
                                                ' . ', ' .¦').replace(
                                                    '¶ ¶ ', '¦').replace(
                                                        '¶ Output ¶',
                                                        'Output ¶').split('¦')

    #hackerearth
    else:
        c = []
        c += [
            i.encode('utf-8')
            for i in segtok.segmenter.split_multi(a[0].decode('utf-8'))
        ]
        for i in c:
            b+=i.replace('Description: ¶ ', '').replace('¶ ¶ Output', '¶ Output').replace('¶ Output', '¶ ¶ Output').replace('¶ ¶ Input : ¶ ', '¦¡ Input¦').replace('¶ ¶ Output : ¶ ', '¦¡ Output¦').replace('¶ ¶ Input: ¶ ', '¦¡ Input¦').replace('¶ ¶ Output: ¶ ', '¦¡ Output¦').replace('¶ ¶ Input ¶ ', '¦¡ Input¦').replace('¶ ¶ Output ¶ ', '¦¡ Output¦').replace('¶ ¶ Input ', '¦¡ Input¦').replace('¶ ¶ Examples ', '¡ Examples').replace('¶ ¶ Output ', '¦¡ Output¦').replace('¶ ¶ Note ¶ ', '¦¡ Note¦') \
            .replace('¶ ¶ SAMPLE INPUT ¶', '¦¡ Examples¦¶ ¶ Input ¶').replace('¶ ¶ SAMPLE OUTPUT ¶', '¦¶ ¶ Output ¶').replace('¶ ¶ Constraints : ¶ ', '¦¡ Constraints¦').replace('¶ ¶ Constraint : ¶ ', '¦¡ Constraints¦').replace('¶ ¶ Constraints: ¶ ', '¦¡ Constraints¦').replace('¶ ¶ Constraint: ¶ ', '¦¡ Constraints¦').replace('¶ ¶ Constraints ¶ ', '¦¡ Constraints¦').replace('¶ ¶ Constraint ¶ ', '¦¡ Constraints¦').replace('¶ ¶ Explanation ¶ ', '¦¡ Explanation¦').replace('¶ ¶ ', '¦').split('¦')

    b = [
        split_nums(split_contractions(char_split_if_io_example(x))) for x in b
        if x.strip()
    ]
    return b
Example #20
 def test_split_unicode(self):
     stem, contraction = split_contractions(["a\u2032d"])
     self.assertEqual(stem, 'a')
     self.assertEqual(contraction, "\u2032d")
Example #21
 def test_split_not(self):
     stem, contraction = split_contractions(["don't"])
     self.assertEqual(stem, 'do')
     self.assertEqual(contraction, "n't")
Example #22
 def __init__(self,
              text: str = None,
              use_tokenizer: bool = False,
              labels: Union[(List[Label], List[str])] = None,
              language_code: str = None):
     super(Sentence, self).__init__()
     self.tokens = []
     self.labels = []
     if (labels is not None):
         self.add_labels(labels)
     self._embeddings = {}
     self.language_code = language_code
     if (text is not None):
         if use_tokenizer:
             tokens = []
             sentences = split_single(text)
             for sentence in sentences:
                 contractions = split_contractions(word_tokenizer(sentence))
                 tokens.extend(contractions)
             index = text.index
             running_offset = 0
             last_word_offset = (-1)
             last_token = None
             for word in tokens:
                 try:
                     word_offset = index(word, running_offset)
                     start_position = word_offset
                 except:
                     word_offset = (last_word_offset + 1)
                     start_position = ((running_offset + 1) if
                                       (running_offset > 0) else
                                       running_offset)
                 token = Token(word, start_position=start_position)
                 self.add_token(token)
                 if (((word_offset - 1) == last_word_offset)
                         and (last_token is not None)):
                     last_token.whitespace_after = False
                 word_len = len(word)
                 running_offset = (word_offset + word_len)
                 last_word_offset = (running_offset - 1)
                 last_token = token
         else:
             word = ''
             index = (-1)
             for (index, char) in enumerate(text):
                 if (char == ' '):
                     if (len(word) > 0):
                         token = Token(word,
                                       start_position=(index - len(word)))
                         self.add_token(token)
                     word = ''
                 else:
                     word += char
             index += 1
             if (len(word) > 0):
                 token = Token(word, start_position=(index - len(word)))
                 self.add_token(token)
     if (text == ''):
         log.warn(
             'ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?'
         )
     self.tokenized = None
Example #23
 def __nltk_stem__(self, word):
     return ' '.join([
         self.stem.stem(w) for w in split_contractions(web_tokenizer(word))
     ])
Example #24
 def __polish_stem__(self, word):
     return ' '.join(
         self.stem.stemmer_convert(
             [w for w in split_contractions(web_tokenizer(word))]))
Example #25
 def setUp(self):
     self.tokenizer = test_tokenizer_with_spans(
         self, lambda t: split_contractions(space_tokenizer(t)))
Example #26
    def __init__(
        self,
        text: str = None,
        use_tokenizer: bool = False,
        labels: Union[List[Label], List[str]] = None,
        language_code: str = None,
    ):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None:
            self.add_labels(labels)

        self._embeddings: Dict = {}

        self.language_code: str = language_code

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    try:
                        word_offset = index(word, running_offset)
                        start_position = word_offset
                    except:
                        word_offset = last_word_offset + 1
                        start_position = (running_offset +
                                          1 if running_offset > 0 else
                                          running_offset)

                    token = Token(word, start_position=start_position)
                    self.add_token(token)

                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False

                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # add each word in tokenized string as Token object to Sentence
                word = ""
                index = -1
                for index, char in enumerate(text):
                    if char == " ":
                        if len(word) > 0:
                            token = Token(word,
                                          start_position=index - len(word))
                            self.add_token(token)

                        word = ""
                    else:
                        word += char
                # increment for the last token in the sentence if not followed by whitespace
                index += 1
                if len(word) > 0:
                    token = Token(word, start_position=index - len(word))
                    self.add_token(token)

        # log a warning if the dataset is empty
        if text == "":
            log.warn(
                "ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?"
            )
Example #27
    def __init__(self, text: str = None, use_tokenizer: bool = False, labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer:

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    try:
                        word_offset = index(word, running_offset)
                        start_position = word_offset
                    except:
                        word_offset = last_word_offset + 1
                        start_position = running_offset + 1 if running_offset > 0 else running_offset

                    token = Token(word, start_position=start_position)
                    self.add_token(token)

                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False

                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            else:
                # catch the empty string case
                if not text:
                    raise ValueError("Cannot convert empty string to a Sentence object.")
                # add each word in tokenized string as Token object to Sentence
                word = ''
                for index, char in enumerate(text):
                    if char == ' ':
                        if len(word) > 0:
                            token = Token(word, start_position=index-len(word))
                            self.add_token(token)

                        word = ''
                    else:
                        word += char
                # increment for the last token in the sentence if not followed by whitespace
                index += 1
                if len(word) > 0:
                    token = Token(word, start_position=index-len(word))
                    self.add_token(token)
Example #28
 def test_split_unicode(self):
     stem, contraction = split_contractions(["a\u2032d"])
     self.assertEqual(stem, 'a')
     self.assertEqual(contraction, "\u2032d")
Example #29
    def __init__(self,
                 text: str = None,
                 use_tokenizer: str = 'split',
                 labels: Union[List[Label], List[str]] = None):

        super(Sentence, self).__init__()

        self.tokens: List[Token] = []

        self.labels: List[Label] = []
        if labels is not None: self.add_labels(labels)

        self._embeddings: Dict = {}

        # if text is passed, instantiate sentence with tokens (words)
        if text is not None:

            # tokenize the text first if option selected
            if use_tokenizer == 'segtok':

                # use segtok for tokenization
                tokens = []
                sentences = split_single(text)
                for sentence in sentences:
                    contractions = split_contractions(word_tokenizer(sentence))
                    tokens.extend(contractions)

                # determine offsets for whitespace_after field
                index = text.index
                running_offset = 0
                last_word_offset = -1
                last_token = None
                for word in tokens:
                    try:
                        word_offset = index(word, running_offset)
                        start_position = word_offset
                    except:
                        word_offset = last_word_offset + 1
                        start_position = running_offset + 1 if running_offset > 0 else running_offset

                    token = Token(word, start_position=start_position)
                    self.add_token(token)

                    if word_offset - 1 == last_word_offset and last_token is not None:
                        last_token.whitespace_after = False

                    word_len = len(word)
                    running_offset = word_offset + word_len
                    last_word_offset = running_offset - 1
                    last_token = token

            # otherwise assumes whitespace tokenized text
            elif use_tokenizer == 'split':
                # add each word in tokenized string as Token object to Sentence
                offset = 0
                for word in text.split(' '):
                    if word:
                        try:
                            word_offset = text.index(word, offset)
                        except:
                            word_offset = offset

                        token = Token(word, start_position=word_offset)
                        self.add_token(token)
                        offset += len(word) + 1
            elif use_tokenizer == 'toki':
                cmd = ['toki-app', '-q', '-n', '-c', 'nkjp']
                p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
                stdout = p.communicate(input=text.encode('utf-8'))[0]
                offset = 0
                print(stdout.decode('utf-8').split('\n'))
                for t in stdout.decode('utf-8').split(
                        '\n')[:-2]:  #omit last two newlines
                    print('XX', t)
                    m = re.match(r'^(.*)/[tp]:(none|space|newline)', t)
                    word = m.group(1)
                    # before=m.group(2)
                    # print(word, text)
                    word_offset = text.index(word, offset)

                    token = Token(word, start_position=word_offset)
                    self.add_token(token)
                    offset = word_offset + len(word)
Example #30
 def __simple_filter__(self, word):
     term = word.lower()
     for p in punctuation:
         term = term.replace(p, ' ')
     term = ' '.join([w for w in split_contractions(web_tokenizer(term))])
     return term.strip()