Example no. 1
def read_sentence14_target(file_path, max_offset_len=83):
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for sentence in root:
            example = dict()
            example["sentence"] = sentence.find('text').text.lower()

            # for RAN
            tokens = tk.tokenize(example['sentence'])

            terms = sentence.find('aspectTerms')
            if terms is None:
                continue
            example["aspect_sentiment"] = []
            example["left_right"] = []
            example['offset'] = []

            for c in terms:
                target = c.attrib['term'].lower()
                example["aspect_sentiment"].append(
                    (target, c.attrib['polarity']))

                # for td lstm
                left_index = int(c.attrib['from'])
                right_index = int(c.attrib['to'])
                example["left_right"].append(
                    (example['sentence'][:right_index],
                     example['sentence'][left_index:], c.attrib['polarity']))

                # for RAN
                left_word_offset = len(
                    tk.tokenize(example['sentence'][:left_index]))
                right_word_offset = len(
                    tk.tokenize(example['sentence'][right_index:]))
                # Position weights for RAN: context tokens get a weight that decays
                # with distance from the aspect, aspect tokens get 0, padding gets -1.
                token_index = list(range(len(tokens)))
                token_length = float(len(token_index))
                for i in range(len(tokens)):
                    if i < left_word_offset:
                        token_index[i] = 1 - (left_word_offset -
                                              token_index[i]) / token_length
                    elif i >= len(tokens) - right_word_offset:
                        token_index[i] = 1 - (token_index[i] -
                                              (len(tokens) - right_word_offset)
                                              + 1) / token_length
                    else:
                        token_index[i] = 0
                token_index += [-1.] * (max_offset_len - len(tokens))
                example['offset'].append(
                    (token_index, target, c.attrib['polarity']))
            yield example
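A minimal usage sketch for the generator above. The XML path is a hypothetical placeholder for a SemEval-2014 ABSA file, and the module-level names the function relies on (etree, MosesTokenizer) are assumed to be imported as shown:

from lxml import etree  # or: import xml.etree.ElementTree as etree
from nltk.tokenize.moses import MosesTokenizer  # removed from NLTK 3.3+

# Hypothetical path to a SemEval-2014 ABSA training file.
for example in read_sentence14_target('Restaurants_Train.xml'):
    print(example['sentence'])
    for target, polarity in example['aspect_sentiment']:
        print('   ', target, '->', polarity)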
Example no. 2
class NLTKMosesTokenizer:
    """Create the Moses Tokenizer implemented by in NLTK.

    From:
        https://www.nltk.org/_modules/nltk/tokenize/moses.html
    
    Examples:
    >>> tokenizer = prenlp.tokenizer.NLTKMosesTokenizer()
    >>> tokenizer('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    >>> tokenizer.tokenize('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception:
            # The Moses module needs extra NLTK data; download it and retry the import.
            import nltk
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text, escape=False)
Example no. 3
def process_data(sequences_text):
    load_wordvec_dict()
    t = MosesTokenizer()
    sequences = np.empty_like(sequences_text)
    num_unrecognized = 0
    unrecognized_words = {}
    for i, s in enumerate(sequences_text):
        s = clean_string(s)
        s_t = t.tokenize(s, escape=False)
        s_t = [w.lower() for w in s_t]
        for j, w in enumerate(s_t):
            try:
                s_t[j] = vocab.index(w)
            except ValueError:
                # add vocabulary item
                vocab.append(w)
                # add embeddings item
                embds.append([0] * embds_dim)
                s_t[j] = len(vocab) - 1
                num_unrecognized += 1
                unrecognized_words[w] = 1
        sequences[i] = s_t
    print("Unrecognized vectors:::", num_unrecognized)
    print("Unrecognized words:::", unrecognized_words.keys())
    print("Processing Data Finished")
    return sequences
Example no. 4
class NLTKMosesTokenizer(Component):
    """Class for splitting texts on tokens using NLTK wrapper over MosesTokenizer

    Attributes:
        escape: whether to escape characters for use in HTML markup
        tokenizer: tokenizer instance from nltk.tokenize.moses
        detokenizer: detokenizer instance from nltk.tokenize.moses

    Args:
        escape: whether to escape characters for use in HTML markup
    """

    def __init__(self, escape: bool=False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize given batch of strings or detokenize given batch of lists of tokens

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples
        """
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]
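A short usage sketch for this component, assuming DeepPavlov's Component base class and the nltk Moses tokenizer/detokenizer imports it relies on are available; the sample batch is illustrative:

tokenizer = NLTKMosesTokenizer(escape=False)

tokenized = tokenizer(["Hello, world!", "Moses handles punctuation."])
# e.g. [['Hello', ',', 'world', '!'], ['Moses', 'handles', 'punctuation', '.']]

restored = tokenizer(tokenized)
# e.g. ['Hello, world!', 'Moses handles punctuation.']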
Example no. 5
def print_unrolled_stats(unrolled_data):
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()

    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize((e['sentence']))))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment,
                                           counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
        print()
    print(sentiment_counter)
    return counter
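A small illustrative call, assuming the module-level names the function relies on (MosesTokenizer, collections.defaultdict) are imported; the records below are made up and only need 'aspect', 'sentiment' and 'sentence' keys:

demo = [
    {'aspect': 'food', 'sentiment': 'positive', 'sentence': 'The food was great.'},
    {'aspect': 'food', 'sentiment': 'negative', 'sentence': 'The food was cold.'},
    {'aspect': 'service', 'sentiment': 'positive', 'sentence': 'Friendly service.'},
]
print_unrolled_stats(demo)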
Example no. 6
class SacreMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in sacremoses.

    Users of this class are required to install
    `sacremoses <https://github.com/alvations/sacremoses>`_.
    For example, one can use :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.SacreMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """

    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            try:
                from nltk.tokenize.moses import MosesTokenizer
                self._tokenizer = MosesTokenizer()
            except ImportError:
                raise ImportError('NLTK is also not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesTokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example no. 7
def normalize_text(html):
    try:
        url_re = re.compile(r"https{0,1}://[^\s]+")
        url2_re = re.compile(r"[a-z0-9\.]+\.[a-z0-9\.]+/[^\s]*")
        space_re = re.compile("[\s]{2,}")

        # Strip non-ASCII characters before extracting the article body.
        html = html.encode("ascii", errors="ignore").decode()
        text = newspaper.fulltext(html)

        sent = text.encode('ascii', errors='ignore').decode()
        sent = sent.replace("\r", " ").replace("\n", " ").replace("\\", "")
        text = sent

        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        detokens = d.detokenize(tokens)
        text = " ".join(detokens)
        # Remove URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)
            
        # Removing multiple spacing characters
        text = space_re.sub(" ", text)

        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Strip leading and trailing spaces
        text = text.strip()
        return text
    except Exception as e:
        return ""
Example no. 8
def process_data(vocab_size, batch_size, skip_window):
    client = MongoClient()
    db = client.nyt
    collection = db["caratulas"]
    start_date = datetime(2016, 1, 1, 0, 0, 0)
    end_date = datetime(2017, 1, 1, 0, 0, 0)
    cursor = collection.find({
        "$and": [{
            "lead_paragraph": {
                "$exists": True,
                "$nin": [None]
            }
        }, {
            "pub_date": {
                "$exists": True,
                "$lt": end_date,
                "$gte": start_date
            }
        }]
    })
    articles = [x["lead_paragraph"].lower() for x in cursor]
    tokenizer = MosesTokenizer()
    articles_tok = [tokenizer.tokenize(x) for x in articles]
    flat_art = [x for article in articles_tok for x in article]
    dictionary, _ = build_vocab(flat_art, vocab_size)
    index_words = convert_words_to_index(articles_tok, dictionary)
    del flat_art  # to save memory
    del articles_tok
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)
Example no. 9
def clean_text(raw_text, get_questions=False):
    """
    Words consist of letters or numbers
    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Tokenize text into sentences.
    raw_text = delete_parenthesis(raw_text)

    sentences = nltk.sent_tokenize(raw_text)

    #Tokenize each sentence
    sanitized_sentences = []
    for s in sentences:
        # Use Moses instead of nltk.word_tokenize(s): better with apostrophes, e.g. can't -> (can + 't) rather than (ca + n't)
        tokenizer = MosesTokenizer()
        s_tokens = tokenizer.tokenize(s)
        #s_tokens = nltk.word_tokenize(s)
        if (not get_questions
                and s_tokens[-1] != '?') or (get_questions
                                             and s_tokens[-1] == '?'):
            sanitized_sentences.append(sanitize(s_tokens))

    #Sanitized tokens joined using detokenizer
    detokenizer = MosesDetokenizer()
    return [
        detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences
    ]
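An illustrative call, assuming sanitize() and delete_parenthesis() from the surrounding module are defined, along with the nltk and Moses imports the function relies on:

text = "The cat sat on the mat. Where did the cat sit?"
statements = clean_text(text)                      # non-question sentences only
questions = clean_text(text, get_questions=True)   # questions only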
Example no. 10
class SacreMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in sacremoses.

    Users of this class are required to install
    `sacremoses <https://github.com/alvations/sacremoses>`_.
    For example, one can use :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.SacreMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """
    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            try:
                from nltk.tokenize.moses import MosesTokenizer
                self._tokenizer = MosesTokenizer()
            except ImportError:
                raise ImportError('NLTK is also not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesTokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example no. 11
class NLTKMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to install `NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKMosesTokenizer()
    >>> tokenizer('Gluon NLP toolkit provides a suite of text processing tools.')
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer('Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools '
    ...           'zur Verfügung.')
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            warnings.warn(
                'NLTK or relevant packages are not installed. '
                'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                'You must install NLTK <= 3.2.5 in order to use the '
                'NLTKMosesTokenizer. You can refer to the official '
                'installation guide in https://www.nltk.org/install.html .'
                ' Now try SacreMosesTokenizer using sacremoses ...')
            try:
                from sacremoses import MosesTokenizer
            except ImportError:
                raise ImportError(
                    'sacremoses is also not installed. '
                    'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                    'To install sacremoses, use pip install -U sacremoses')
        try:
            self._tokenizer = MosesTokenizer()
        except ValueError:
            raise ValueError(
                'The instantiation of MosesTokenizer in sacremoses is'
                ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example no. 12
def tokenize(txt, to_lower=False):
    assert isinstance(txt, str)
    tokenizer = MosesTokenizer()
    lines = txt.split('\n')
    t = [tokenizer.tokenize(line) for line in lines]
    if to_lower:
        return [[word.lower() for word in line] for line in t]
    else:
        return t
Example no. 13
class NLTKMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to install `NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            warnings.warn('NLTK or relevant packages are not installed. '
                          'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                          'You must install NLTK <= 3.2.5 in order to use the '
                          'NLTKMosesTokenizer. You can refer to the official '
                          'installation guide in https://www.nltk.org/install.html .'
                          ' Now try SacreMosesTokenizer using sacremoses ...')
            try:
                from sacremoses import MosesTokenizer
            except ImportError:
                raise ImportError('sacremoses is also not installed. '
                                  'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                                  'To install sacremoses, use pip install -U sacremoses')
        try:
            self._tokenizer = MosesTokenizer()
        except ValueError:
            raise ValueError('The instantiation of MosesTokenizer in sacremoses is'
                             ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example no. 14
class SentenceCouples(object):
    """
    Pairs of sentences tokenized at the word-level.
    """
    def __init__(self,
                 input_,
                 max_items=None,
                 max_len=30,
                 tokenize=False,
                 level='word'):
        if os.path.isdir(input_):
            if not input_.endswith('/'):
                input_ += '/'
            self.filenames = sorted(list(glob.glob(input_ + '*.txt')))
        else:
            self.filenames = [input_]

        self.max_items = max_items
        self.max_item_len = max_len
        self.processed = 0
        self.tokenize, self.tokenizer = tokenize, None
        assert level in ('word', 'char')
        self.level = level
        if self.level == 'word' and self.tokenize:
            self.tokenizer = MosesTokenizer()

    def __iter__(self):
        for filename in self.filenames:
            couple = deque(maxlen=2)
            for line in open(filename, 'r'):
                line = ' '.join(line.strip().split())
                if not line:
                    continue
                if self.level == 'word':
                    if self.tokenize:
                        try:
                            items = tuple(self.tokenizer.tokenize(line))
                        except IndexError:
                            items = None
                    else:
                        items = line.split()
                elif self.level == 'char':
                    items = tuple(line.lower())
                if items and len(items) <= self.max_item_len:
                    couple.append(items)
                    if len(couple) == 2:
                        self.processed += 1
                        yield tuple(couple)
                if self.max_items and self.processed >= self.max_items:
                    return

    def __len__(self):
        # number of sentence couples yielded so far
        return self.processed
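A minimal sketch of consuming the iterator, assuming a hypothetical plain-text file corpus.txt with one sentence per line (tokenize=False sidesteps the Moses dependency and falls back to whitespace splitting):

couples = SentenceCouples('corpus.txt', max_items=5, tokenize=False, level='word')
for first, second in couples:
    print(first, '->', second)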
Example no. 15
def process_hierarchical_data(sequences):
    load_wordvec_dict()

    t = MosesTokenizer()
    processed_sequences = np.zeros_like(sequences)
    for i, seq in enumerate(sequences):
        seq = clean_string(seq)
        sentences = sent_tokenize(seq)
        for z, sent in enumerate(sentences):
            sent_t = t.tokenize(sent)
            sent_t = [w.lower() for w in sent_t]
            for j, w in enumerate(sent_t):
                try:
                    sent_t[j] = vocab.index(w)
                except ValueError:
                    # add vocabulary item
                    vocab.append(w)
                    # add embeddings item
                    embds.append([0] * embds_dim)
                    sent_t[j] = len(vocab) - 1

            sentences[z] = sent_t
        processed_sequences[i] = sentences
    seq_lengths = np.asarray(list(map(len, processed_sequences)))
    sent_lengths = np.asarray(
        [list(map(len, seq)) for seq in processed_sequences])
    sent_lengths = pad_sequences(sent_lengths, max_length_allowed=100)[0]
    print("seq_length shape: ")
    print(seq_lengths.shape)
    print(seq_lengths[0:3])
    print("sent_length shape: ")
    print(sent_lengths.shape)
    print(sent_lengths[0:3])
    print("max_sent_length")
    print(sent_lengths.max())
    max_seq_length = seq_lengths.max()
    max_sent_length = sent_lengths.max()  # weird that max returns a list

    processed_sequences = np.asarray([
        pad_sequences(seq,
                      max_length_allowed=max_sent_length,
                      length=max_sent_length,
                      padding_val=0)[0] for seq in processed_sequences
    ])
    processed_sequences = pad_sequences(processed_sequences,
                                        max_length_allowed=max_seq_length,
                                        length=max_seq_length,
                                        padding_val=np.zeros_like(
                                            processed_sequences[0])[0])[0]

    print("Processing Data Finished")
    return processed_sequences, sent_lengths, seq_lengths
Example no. 16
def tokenize_text(text):
    # Tokenizers are basically an advanced split
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    processed_text = tokenizer.tokenize(text)

    # Need to detokenize to get all the weird symbols back as symbols
    processed_text = detokenizer.detokenize(processed_text)

    processed_text = preprocess(processed_text)

    return " ".join(processed_text)
Example no. 17
 def char_to_token_loc_mapping(self):
     '''
     Mapping from character location in context to the corresponding token locations.
     Then, add answer start/end token index columns to the data.
         original text: self.data.context[c_i]
         tokenized text: c_tk
         token index: self.data.context_tk_index[c_i]
     '''
     nltk_tokenizer = MosesTokenizer()
     
     answer_start_token_idx_list, answer_end_token_idx_list = [], []
     for c_i, c_tk in enumerate(self.data.context_tk):
         answer_start = nltk_tokenizer.tokenize(self.data.context[c_i][self.data.answer_start[c_i]:], escape=False) # context text from the first answer token to end
         answer_end = nltk_tokenizer.tokenize(self.data.context[c_i][:self.data.answer_end[c_i]+1], escape=False) # context text from the first to end of answer token
         
         answer_start_token_idx = len(c_tk)- len(answer_start)
         answer_end_token_idx = answer_start_token_idx # initialize to start token location
 
         for i, tk in enumerate(c_tk[answer_start_token_idx:]):
             if tk == answer_end[-1]: # add to the index as many steps as it's moved to find the end of answer token
                 answer_end_token_idx += i
                 break
     
         '''
         Codes for verification:
             print(self.data.answer_text[c_i]) - Saint Bernadette Soubirous
             print(c_tk[answer_start_token_idx:answer_end_token_idx+1]) - ['Saint', 'Bernadette', 'Soubirous']
             for m in range(answer_start_token_idx, answer_end_token_idx+1): - 849 39352 39353
                 print(self.tokenizer.word_index[c_tk[m].lower()], end =' ')
             print(answer_start_token_idx, answer_end_token_idx) - 102 104
         '''
         
         pad_counts = np.count_nonzero(self.context_vector[c_i] == 0)
         
         answer_start_token_idx_list.append(answer_start_token_idx + pad_counts)
         answer_end_token_idx_list.append(answer_end_token_idx + pad_counts)
         # print(self.context_vector[c_i][answer_start_token_idx_list[c_i]:answer_end_token_idx_list[c_i]+1])
         
     return list(zip(answer_start_token_idx_list, answer_end_token_idx_list))
Example no. 18
def print_unrolled_stats_atsa(unrolled_data):
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()

    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize((e['sentence']))))

    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))

    return counter
Example no. 19
def _process_caption(caption):
    """Processes a caption string into a list of tokenized words.

    Args:
      caption: A string caption.

    Returns:
      A list of strings; the tokenized caption.
    """
    tokenizer = MosesTokenizer()
    tokenized_caption = ["SEQUENCE_START"]
    tokenized_caption.extend(tokenizer.tokenize(caption.lower()))
    tokenized_caption.append("SEQUENCE_END")
    return tokenized_caption
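An illustrative call, assuming MosesTokenizer is imported at module level as the function expects:

print(_process_caption("A dog runs across the field."))
# e.g. ['SEQUENCE_START', 'a', 'dog', 'runs', 'across', 'the', 'field', '.', 'SEQUENCE_END']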
Example no. 20
def extract_data_from_db(start_year, stop_year, strategy="article"):
    print("hola")
    stopWords = set(stopwords.words('english'))
    # Possible strategies: article, sentence
    client = MongoClient()
    db = client.nyt
    collection = db["caratulas"]
    start_date = datetime(start_year, 1, 1, 0, 0, 0)
    end_date = datetime(stop_year, 12, 31, 23, 59, 59)
    cursor = collection.find({
        "$and": [{
            "lead_paragraph": {
                "$exists": True,
                "$nin": [None]
            }
        }, {
            "pub_date": {
                "$exists": True,
                "$lt": end_date,
                "$gte": start_date
            }
        }]
    })
    articles = [x["lead_paragraph"].lower() for x in cursor]
    if (strategy == "article"):
        tokenizer = MosesTokenizer()
        articles_tok = [[
            w for w in tokenizer.tokenize(x)
            if w not in stopWords and w.isalpha()
        ] for x in articles]
    elif (strategy == "sentence"):
        tokenizer = MosesTokenizer()
        articles_tok = [[w for w in tokenizer.tokenize(y) if w.isalpha()]
                        for x in articles for y in x.split(". ")]

    return articles_tok
Example no. 21
def cut_words(data):
    #stopWords = set(nltk.corpus.stopwords.words('english'))
    stopwords = nltk.corpus.stopwords.words('english')
    # add extra stopwords
    for i in import_stop:
        stopwords.append(i)
    #stopwords.append(':')
    moses = MosesTokenizer()
    words = moses.tokenize(data)
    wordsFiltered = []

    for w in words:
        if w not in stopwords:
            wordsFiltered.append(w)
    return (wordsFiltered)
Example no. 22
class MosesTokenizer(Tokenizer):
    def __init__(self):
        super().__init__()
        self._tokenizer = NLTKMosesTokenizer()
        self._detokenizer = MosesDetokenizer()

    def tokenize(self, sentence):
        return self._tokenizer.tokenize(sentence)

    def detokenize(self, tokens):
        """Unescape Moses punctuation tokens.

        Replaces escape sequences like &#91; with the original characters
        (such as '['), so they better align to the original text.
        """
        return [self._detokenizer.unescape_xml(t) for t in tokens]
Example no. 23
    def get_vector(self, inputs, tokenized_corpus, max_word_num, max_sequence_len):
        loader = data_loader.DataLoader(inputs)
        self.data = pd.DataFrame({'title': loader.title, 'context': loader.context, 'question':loader.question, 'answer_start':loader.answer_start, 'answer_end':loader.answer_end, 'answer_text':loader.answer_text})
            
        self.tokenizer, self.vocabulary = self.create_vocab(tokenized_corpus, max_word_num)
                            
        # tokenization & add tokens, token indexes to columns
        nltk_tokenizer = MosesTokenizer()
        vectors = []
        for i, text_column in enumerate(['context' , 'question']):
            self.data[text_column + '_tk'] = self.data[text_column].apply(lambda i: nltk_tokenizer.tokenize(i.replace('\n', '').strip(), escape=False))
        
            # token to index
            self.data[text_column+'_tk_index'] = self.tokenizer.texts_to_sequences(self.data[text_column + '_tk'].apply(lambda i: ' '.join(i)))
            
            # padding: It returns context, question vectors.
            vectors.append(pad_sequences(self.data[text_column+'_tk_index'], max_sequence_len[i]))

        return vectors
Example no. 24
class NLTKTokenizer(Component):
    def __init__(self, escape=False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch, *args, **kwargs):
        if isinstance(batch[0], str):
            return [
                self.tokenizer.tokenize(line, escape=self.escape)
                for line in batch
            ]
        else:
            return [
                self.detokenizer.detokenize(line,
                                            return_str=True,
                                            unescape=self.escape)
                for line in batch
            ]
Example no. 25
def clean_text(raw_text, newline = False):
    """
    Words consist of letters or numbers
    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Tokenize text into sentences.
    sentences = nltk.sent_tokenize(raw_text)

    #Tokenize each sentence
    sanitized_sentences = []
    for s in sentences:
        # Use Moses instead of nltk.word_tokenize(s): better with apostrophes, e.g. can't -> (can + 't) rather than (ca + n't)
        tokenizer = MosesTokenizer()
        s_tokens = tokenizer.tokenize(s)
        #s_tokens = nltk.word_tokenize(s)
        sanitized_sentences.append(sanitize(s_tokens))

    #Sanitized tokens joined using detokenizer
    detokenizer = MosesDetokenizer()
    if newline:
        return [detokenizer.detokenize(s, return_str=True)+'\n' for s in sanitized_sentences]
    return [detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences]
Example no. 26
class E2C(object):
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)

    def tokenDoc(self, doc):
        sentenceList = sent_tokenize(doc.strip())
        print('e2c sentenceList : ', sentenceList)
        tokens = []
        for sent in sentenceList:
            sent = sent.lower()
            sent = self.detokenizer.unescape_xml(
                self.tokenizer.tokenize(sent, return_str=True))
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        #ipdb.set_trace()
        for idx in range(len(pred)):
            rstr += ''.join(' '.join(pred[idx][0]).replace(
                self.sep, '').split()) + "\n\n"
        print('e2c rstr : ', rstr.strip())
        return rstr.strip()
Example no. 27
 def chunk_words(self, sentence, language='en'):
     from nltk.tokenize.moses import MosesTokenizer
     tokenizer = MosesTokenizer(lang=language)
     return tokenizer.tokenize(sentence)
Example no. 28
from collections import Counter
import matplotlib.pyplot as plt
train_df = pd.read_csv("train.csv")
from nltk.tokenize.moses import MosesTokenizer, MosesDetokenizer
t, d = MosesTokenizer(), MosesDetokenizer()

author_wise = {}
for author_i in train_df.author.unique():
    print(author_i)
    at = train_df[train_df.author == author_i].text.values
    count_not = 0
    count = 0
    count_s = 0
    count_is = 0
    for i in range(len(at)):
        print(i)
        if i != 3808 and i != 4141:
            tokens = t.tokenize(at[i])
            count_s += Counter(tokens)["&apos;s"]
            count_is += Counter(tokens)["is"]
            count_not += Counter(tokens)["not"]
            count += Counter(tokens)["&apos;t"]
    author_wise[author_i] = [count_s, count_is, count_not, count]
    
d = {'apos-s': {'EAP': 384, 'HPL': 612, 'MWS': 352},
     'is': {'EAP': 1639, 'HPL': 364, 'MWS': 681},
     'not': {'EAP': 1252, 'HPL': 834, 'MWS': 1105},
     'apos-not': {'EAP': 87, 'HPL': 186, 'MWS': 0}}

pd.DataFrame(d).plot(kind='bar')
plt.show()
Example no. 29
class NLTKMosesTokenizer(object):
    r"""Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to `install NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as:

    .. code:: python

        python -m nltk.downloader perluniprops nonbreaking_prefixes

    Examples
    --------
    >>> tokenizer = NLTKMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon',
     'NLP',
     'toolkit',
     'provides',
     'a',
     'suite',
     'of',
     'text',
     'processing',
     'tools',
     '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das',
     'Gluon',
     'NLP-Toolkit',
     'stellt',
     'eine',
     'Reihe',
     'von',
     'Textverarbeitungstools',
     'zur',
     'Verfügung',
     '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKMosesTokenizer. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html .')
        self._tokenizer = MosesTokenizer()

    def __call__(self, sample):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return self._tokenizer.tokenize(sample)
Example no. 30
class TextProcessor:
    def __init__(self):
        # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
        self._tok = MosesTokenizer(lang='en')
        self._stemmer = SnowballStemmer('english')
        self._lemmatizer = TreeTagger(language='english')
        self._stopwords = set(open(STOPWORDS).read().splitlines())
        # istopwords.words('french') #
        self._porter_stemmer = nltk.stem.porter.PorterStemmer()
        # self._sent_tokenizer = util.load_pickle('%s%s'
        # % (STATIC_DATA_ROOT, 'punkt/m07_punkt.pickle'))
        # self._sent_split_ABBR_LIST = set(['Mr.', 'Mrs.', 'Sen.', 'No.',
        # 'Dr.', 'Gen.', 'St.', 'Lt.', 'Col.', 'Capt.'])
        # self._sent_split_PUNCT_LIST = set(['\" ', '\")', ') ', '\' ',
        # '\"\''])

    def sent_split(self, text):
        return nltk.sent_tokenize(text, language='english')

    def tokenize(self, text):
        return self._tok.tokenize(text, escape=False)

    def porter_stem(self, word):
        return self._porter_stemmer.stem(word)

    def remove_stopwords(self, words):
        return [w for w in words if w not in self._stopwords]

    def remove_pos_stopwords(self, words, pos):
        list_lemm = []
        list_pos = []
        for w, p in zip(words, pos):
            if w not in self._stopwords:
                list_lemm.append(w)
                list_pos.append(p)
        return list_lemm, list_pos

    def is_just_stopwords(self, words):
        if isinstance(words, str):
            words = words.split()
        for word in words:
            if word not in self._stopwords:
                return False
        return True

    def remove_punct(self, sentence):
        """
        Remove punctuation from sentence as str
        :param sentence: str: sentence with punctuation
        :return: str: sentence without punctuation
        """
        return re.sub('[' + string.punctuation + ']+', '', sentence).strip()
        # return re.sub(r'[^a-zA-Z0-9- ]', '', sentence).strip()

    def remove_punct_sent(self, sentence):
        return [
            self.remove_punct(word) for word in sentence
            if len(self.remove_punct(word)) > 0
        ]

    def is_punct(self, text):
        """
        returns true if the text (str) consists solely of non alpha-numeric
        characters
        """
        for letter in text.lower():
            if letter not in set(string.punctuation):
                return False
        return True

    def lemm_sent(self, sent):
        if self._lemmatizer is None:
            return sent
        else:
            lemm_sent = []
            lemm_pos = []
            for tup in self._lemmatizer.tag(sent):
                if tup[2] != "<unknown>":
                    if "PUN" not in tup[1] and tup[1] != "SENT":
                        lemm_sent.append(tup[2].split('|')[0])
                        lemm_pos.append(tup[1])
                else:
                    lemm_sent.append(tup[0])
                    lemm_pos.append('u')
            return lemm_sent, lemm_pos

    def stem_sent(self, sent):
        return [self.stem(word) for word in sent]

    def stem(self, word):
        if self._stemmer is None:
            return word
        else:
            return self._stemmer.stem(word)
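A short, illustrative pipeline, assuming STOPWORDS points at a newline-delimited stopword file and that the Moses tokenizer, SnowballStemmer and TreeTagger imports the class relies on are available:

tp = TextProcessor()
for sent in tp.sent_split("Moses tokenization keeps punctuation. It also handles clitics."):
    tokens = [w.lower() for w in tp.tokenize(sent)]
    print(tp.remove_stopwords(tokens))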
Example no. 31
# Initialize Moses tokenizer
tokenizer = MosesTokenizer()

# Count OOVs
counts = defaultdict(lambda:0)
with open(input_file, 'r') as f:
    for l in f:
        # Get first field
        comment = l.split('\t')[0]
        # Remove urls
        comment = ' '.join(filter(lambda w: not text.contains_url(w), comment.split()))
        # Normalize punctuation
        comment = text.normalize_punctuation(comment)
        # Tokenize with the moses tokenizer
        sentence = tokenizer.tokenize(comment)
        for w in sentence:
            # check whether the word is in the WMT dictionary
            if w.lower() not in dic:
                counts[w] += 1
# Sort by counts
sorted_counts = sorted(counts.items(), key=lambda x: x[1])

# Total number of OOVs
tot_oovs = sum(counts.values())

# Save to file
with open(oov_freqs_output_file, 'w+') as f:
    # Print frequency from most frequent to less frequent
    for w, count in reversed(sorted_counts):
        print('%s\t%.3f%%' % (w, count / tot_oovs * 100), file=f)
Example no. 32
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nltk.tokenize import sent_tokenize
from nltk.tokenize.moses import MosesTokenizer
import argparse
import io
import logging
import sys
import tqdm

tokenizer = MosesTokenizer(lang='en')
with io.TextIOWrapper(sys.stdin.buffer, encoding='8859') as sin:
    for line in tqdm.tqdm(sin):
        if line.startswith('CURRENT URL'):
            continue
        for sent in sent_tokenize(line.strip()):
            print(tokenizer.tokenize(sent, return_str=True).lower())
Example no. 33
    p.add_argument('-column',
                   required=True,
                   help='column name to use. headline or short_description')

    p.add_argument('-output', required=True, help='data file name to write')

    config = p.parse_args()

    return config


if __name__ == "__main__":
    config = argparser()

    corpus = pd.read_json(config.input, lines=True).loc[:, config.column]
    corpus = remove_emoji.remove(corpus)

    tokenizer = MosesTokenizer()

    sys.stdout = open(config.output, 'w')

    for line in corpus:
        if line.replace('\n', '').strip() != '':
            # tokenization
            tokens = tokenizer.tokenize(line.replace('\n', '').strip(),
                                        escape=False)
            sys.stdout.write(' '.join(tokens) + '\n')
        else:
            sys.stdout.write('\n')