def normalize_text(html):
    try:
        url_re = re.compile(r"https?://\S+")
        url2_re = re.compile(r"[a-z0-9.]+\.[a-z0-9.]+/\S*")
        space_re = re.compile(r"\s{2,}")

        # newspaper.fulltext expects a str, so strip non-ASCII but keep a str
        html = html.encode("ascii", errors="ignore").decode()
        text = newspaper.fulltext(html)
        
        # Drop carriage returns, newlines and stray backslashes
        text = text.replace("\r", " ").replace("\n", " ").replace("\\", "")

        # Tokenize and detokenize to normalize spacing around punctuation
        t, d = MosesTokenizer(), MosesDetokenizer()
        tokens = t.tokenize(text)
        text = d.detokenize(tokens, return_str=True)

        # Removing URLs
        text = url_re.sub(" ", text)
        text = url2_re.sub(" ", text)

        # Removing multiple spacing characters
        text = space_re.sub(" ", text)

        text = text.encode("ascii", errors="ignore").decode()
        text = preProcess(text)
        # Stripping leading and trailing spaces
        text = text.strip()
        return text
    except Exception:
        # On any parsing failure, fall back to an empty string
        return ""
Example #2
def clean_text(raw_text, get_questions=False):
    """
    Words consist of letters or numbers
    :param raw_text: text (not divided into sentences)
    :return: list of sanitized sentences
    """
    # Remove parenthesised fragments before splitting the text into sentences.
    raw_text = delete_parenthesis(raw_text)

    sentences = nltk.sent_tokenize(raw_text)

    # Tokenize each sentence. Moses handles apostrophes better than
    # nltk.word_tokenize: "cant" -> ("can" + "'t") rather than ("ca" + "n't").
    tokenizer = MosesTokenizer()
    sanitized_sentences = []
    for s in sentences:
        s_tokens = tokenizer.tokenize(s)
        if not s_tokens:
            continue
        is_question = s_tokens[-1] == '?'
        if is_question == get_questions:
            sanitized_sentences.append(sanitize(s_tokens))

    # Join the sanitized tokens back into strings with the detokenizer
    detokenizer = MosesDetokenizer()
    return [
        detokenizer.detokenize(s, return_str=True) for s in sanitized_sentences
    ]
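delete_parenthesis and sanitize are project-specific helpers not shown here. A minimal sketch with trivial stand-ins (purely illustrative, assuming sacremoses and NLTK's punkt data are installed) shows the question-filtering behaviour:

import re
import nltk
from sacremoses import MosesTokenizer, MosesDetokenizer

# Hypothetical stand-ins for the project's own helpers
def delete_parenthesis(text):
    return re.sub(r"\([^)]*\)", "", text)

def sanitize(tokens):
    return [t for t in tokens if t.isalnum() or t in {".", ",", "?", "!"}]

text = "The model works well (see Appendix A). Does it scale?"
print(clean_text(text))                      # declarative sentences only
print(clean_text(text, get_questions=True))  # questions only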
Example #3
def print_unrolled_stats(unrolled_data):
    counter = dict()
    sentiment_counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()

    aspects = set()
    for x in unrolled_data:
        aspects.add(x['aspect'])
    for a in aspects:
        counter[a] = defaultdict(int)
    for e in unrolled_data:
        counter[e['aspect']][e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))
    for aspect in sorted(counter.keys()):
        total = 0
        for sentiment in sorted(counter[aspect].keys()):
            print('# {}\t\t{}:\t{}'.format(aspect, sentiment,
                                           counter[aspect][sentiment]))
            total += counter[aspect][sentiment]
            sentiment_counter[sentiment] += counter[aspect][sentiment]
        counter[aspect]['total'] = total
        print('# {}\t\t{}:\t{}'.format(aspect, 'total', total))
        print()
    print(sentiment_counter)
    return counter
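print_unrolled_stats expects "unrolled" aspect-based sentiment records, one dict per (sentence, aspect) pair. A minimal illustrative call (the sample records are made up):

sample = [
    {'aspect': 'food', 'sentiment': 'positive', 'sentence': 'The pasta was great.'},
    {'aspect': 'food', 'sentiment': 'negative', 'sentence': 'The soup was cold.'},
    {'aspect': 'service', 'sentiment': 'negative', 'sentence': 'We waited an hour.'},
]
stats = print_unrolled_stats(sample)
# prints per-aspect sentiment counts plus totals and returns the nested counter dict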
def process_data(sequences_text):
    load_wordvec_dict()
    t = MosesTokenizer()
    sequences = np.empty_like(sequences_text)
    num_unrecognized = 0
    unrecognized_words = {}
    for i, s in enumerate(sequences_text):
        s = clean_string(s)
        s_t = t.tokenize(s, escape=False)
        s_t = [w.lower() for w in s_t]
        for j, w in enumerate(s_t):
            try:
                s_t[j] = vocab.index(w)
            except ValueError:
                # add vocabulary item
                vocab.append(w)
                # add embeddings item
                embds.append([0] * embds_dim)
                s_t[j] = len(vocab) - 1
                num_unrecognized += 1
                unrecognized_words[w] = 1
        sequences[i] = s_t
    print("Unrecognized vectors:::", num_unrecognized)
    print("Unrecognized words:::", unrecognized_words.keys())
    print("Processing Data Finished")
    return sequences
def process_data(vocab_size, batch_size, skip_window):
    client = MongoClient()
    db = client.nyt
    collection = db["caratulas"]
    start_date = datetime(2016, 1, 1, 0, 0, 0)
    end_date = datetime(2017, 1, 1, 0, 0, 0)
    cursor = collection.find({
        "$and": [{
            "lead_paragraph": {
                "$exists": True,
                "$nin": [None]
            }
        }, {
            "pub_date": {
                "$exists": True,
                "$lt": end_date,
                "$gte": start_date
            }
        }]
    })
    articles = [x["lead_paragraph"].lower() for x in cursor]
    tokenizer = MosesTokenizer()
    articles_tok = [tokenizer.tokenize(x) for x in articles]
    flat_art = [x for article in articles_tok for x in article]
    dictionary, _ = build_vocab(flat_art, vocab_size)
    index_words = convert_words_to_index(articles_tok, dictionary)
    del flat_art  # to save memory
    del articles_tok
    single_gen = generate_sample(index_words, skip_window)
    return get_batch(single_gen, batch_size)
 def __init__(self, name=__name__, phrasefile="", verbose=False):
     if verbose: print("Initializing preprocessor %s"%name)
     self.TOKENIZER = MosesTokenizer(lang='en')
     self.STEMMER = PorterStemmer(mode='NLTK_EXTENSIONS')
     self.STOPWORDS = set(stopwords.words('english'))
     self.TAGS_RE = re.compile('<.*?>')                        
     self.PHRASESPOTTER = None if phrasefile=="" else phrasespotter(phrasefile=phrasefile, verbose=verbose)
Example #7
 def __init__(self):
     try:
         from sacremoses import MosesTokenizer
         self._tokenizer = MosesTokenizer()
     except (ImportError, TypeError) as err:
         if isinstance(err, TypeError):
             warnings.warn(
                 'The instantiation of MosesTokenizer in sacremoses is'
                 ' currently only supported in python3.'
                 ' Now try NLTKMosesTokenizer using NLTK ...')
         else:
             warnings.warn(
                 'sacremoses is not installed. '
                 'To install sacremoses, use pip install -U sacremoses'
                 ' Now try NLTKMosesTokenizer using NLTK ...')
         try:
             from nltk.tokenize.moses import MosesTokenizer
             self._tokenizer = MosesTokenizer()
         except ImportError:
             raise ImportError(
                 'NLTK is also not installed. '
                 'You must install NLTK <= 3.2.5 in order to use the '
                 'NLTKMosesTokenizer. You can refer to the official '
                 'installation guide in https://www.nltk.org/install.html .'
             )
Example #8
 def __init__(self):
     # self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
     self._tok = MosesTokenizer(lang='en')
     self._stemmer = SnowballStemmer('english')
     self._lemmatizer = TreeTagger(language='english')
     self._stopwords = set(open(STOPWORDS).read().splitlines())
     # stopwords.words('french')
     self._porter_stemmer = nltk.stem.porter.PorterStemmer()
Example #9
 def __init__(self):
     try:
         from nltk.tokenize.moses import MosesTokenizer
     except Exception:
         # Fetch the NLTK data the Moses port needs, then retry the import.
         import nltk
         nltk.download('perluniprops')
         nltk.download('nonbreaking_prefixes')
         from nltk.tokenize.moses import MosesTokenizer
     self.tokenizer = MosesTokenizer()
	def build_set(self):
		wn.ensure_loaded()  # `LazyCorpusLoader` conversion into `WordNetCorpusReader` starts
		print ("WordNet loaded")
		swn.ensure_loaded()  # `LazyCorpusLoader` conversion into `SentiWordNetCorpusReader` starts
		print ("SentiWordNet loaded")
		self.tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
		print ("Tweet tokenizer loaded")
		self.it_tokenizer = MosesTokenizer(lang='it')
		print ("Moses tokenizer loaded")
		self.it_tagger = treetaggerwrapper.TreeTagger(TAGLANG="it", TAGDIR=flags.tagger_path)
		# self.en_tagger = treetaggerwrapper.TreeTagger(TAGLANG="en", TAGDIR=flags.tagger_path)
		print ("Tagger loaded")
		self.stop_words = set(stopwords.words('italian'))
		print ("Stopwords loaded")
		self.lexicon = lm.LexiconSent('it')
		print ("OpeNER lexicon loaded")
		self.emoji = self.get_emoji_sentiment_lexicon(flags.emoji_sentiment_lexicon)
		print ("Emoji sentiment lexicon loaded")
		self.translator = Translator()
		print ("Setting up support dictionaries")
		self.translated_lemma_tokens = self.load_obj(flags.translated_lemma_tokens)
		self.lexeme_sentiment_dict = self.load_obj(flags.lexeme_sentiment_dict)
		print ("Translator loaded")
		# Build test annotations
		print ("Building test annotations..")
		test_set = self.load_obj(flags.test_annotations)
		if not test_set:
			test_set = self.get_annotations(flags.test_set_path)
			self.save_obj(test_set, flags.test_annotations)
		print ("Test annotations built")
		# Build training annotations
		print ("Building training annotations..")
		training_set = self.load_obj(flags.training_annotations)
		if not training_set:
			training_set = self.get_annotations(flags.training_set_path)
			self.save_obj(training_set, flags.training_annotations)
		print ("Training annotations built")
		print ("Saving support dictionaries")
		self.save_obj(self.translated_lemma_tokens, flags.translated_lemma_tokens)
		self.save_obj(self.lexeme_sentiment_dict, flags.lexeme_sentiment_dict)
		# Build distributional docvec from training and test sets
		self.doc2vec = self.build_distributional_docvec([test_set, training_set])
		print ("Doc2Vec built")
		self.add_context_to_annotations(test_set)
		print ("Distr. docvec added to test annotations")
		self.add_context_to_annotations(training_set)
		print ("Distr. docvec added to training annotations")
		self.free_ram()
		print ("Loading pre-trained model..")
		self.model = ft.load_model(flags.word2vec_path)
		print ("Pre-trained model loaded")
		self.add_wordvecs_to_annotations(test_set)
		print ("Wordvecs added to test annotations")
		self.add_wordvecs_to_annotations(training_set)
		print ("Wordvecs added to training annotations")
		# Save to npy
		self.free_ram()
		self.save_obj({"test_set":test_set, "training_set":training_set}, flags.preprocessed_dict)
 def tokenize(txt, to_lower=False):
     assert isinstance(txt, str)
     tokenizer = MosesTokenizer()
     lines = txt.split('\n')
     t = [tokenizer.tokenize(line) for line in lines]
     if to_lower:
         return [[word.lower() for word in line] for line in t]
     else:
         return t
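A quick usage sketch of the helper above (assuming a Moses tokenizer such as sacremoses is imported as MosesTokenizer in the same module):

lines = tokenize("First line.\nSecond LINE!", to_lower=True)
# one token list per input line, e.g. [['first', 'line', '.'], ['second', 'line', '!']]
for line in lines:
    print(line)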
Example #12
    def moses_init(self):
        from nltk.tokenize.moses import MosesTokenizer
        from nltk.tokenize import sent_tokenize
        self.model_punkt = sent_tokenize
        self.model_moses = MosesTokenizer(self.lang)

        self.parse = self._parse
        self.sent_seger = self.punkt_sent_seger
        self.tokenizer = self.moses_tokenizer
        self.processor = None
Example #13
 def __init__(self):
     try:
         from nltk.tokenize.moses import MosesTokenizer
     except ImportError:
         raise ImportError(
             'NLTK or relevant packages are not installed. You must install NLTK '
             'in order to use the NLTKMosesTokenizer. You can refer to the '
             'official installation guide in https://www.nltk.org/install.html .'
         )
     self._tokenizer = MosesTokenizer()
Example #14
    def __init__(self, filename, genia, gen_features, lowercase,
                 replace_digits, to_filter):
        self.filename = filename
        self.basename = os.path.basename(filename)
        self.protocol_name = self.basename
        self.text_file = self.filename + '.txt'
        self.ann_file = self.filename + '.ann'

        with io.open(self.text_file, 'r', encoding='utf-8',
                     newline='') as t_f, io.open(self.ann_file,
                                                 'r',
                                                 encoding='utf-8',
                                                 newline='') as a_f:
            self.tokenizer = MosesTokenizer()
            self.lines = []
            for line in t_f.readlines():
                self.lines.append(html.unescape(line))

            self.text = "".join(self.lines)  # full text
            self.ann = a_f.readlines()
            self.status = self.__pretest()
            self.links = []

        if self.status:
            sents = [self.tokenizer.tokenize(line)
                     for line in self.lines]  # generate list of list of words
            self.heading = sents[0]
            self.sents = sents[1:]
            self.tags = self.__parse_tags()
            self.unique_tags = set([tag.tag_name for tag in self.tags])
            self.__std_index()
            self.__parse_links()
            self.tag_0_id = 'T0'
            self.tag_0_name = 'O'
            self.tokens2d = self.gen_tokens(labels_allowed=cfg.LABELS,
                                            lowercase=lowercase,
                                            replace_digits=replace_digits)
            self.tokens2d = [[self.clean_html_tag(token) for token in token1d]
                             for token1d in self.tokens2d]

            self.word_cnt = sum(len(tokens1d) for tokens1d in self.tokens2d)
            self.f_df = None
            if gen_features:
                if genia:
                    self.pos_tags = self.__gen_pos_genia(genia)
                else:
                    self.pos_tags = self.__gen_pos_stanford()

                self.conll_deps = self.__gen_dep()
                self.parse_trees = self.__gen_parse_trees()

            if to_filter:
                self.filter()

            self.relations = self.gen_relations()
Example #15
    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)

        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)
Example #16
    def test_Diff_btw_perl_package(self):
        # This tests a special case that will fail:
        # for any multi-dot ending the tokenizer will add a space in between
        with open(self.min_data_path) as f:
            line = f.readline()
        tokenizer_cmd = [self.perl_path, "-l", 'en', "-q", "-"]
        tokenizer_perl = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
        perl_sentence, _ = tokenizer_perl.communicate(line)

        package_sentence = MosesTokenizer().tokenize(line, return_str=True)

        self.assertEqual(perl_sentence, package_sentence.encode('utf8'))
Example #17
def read_sentence14_target(file_path, max_offset_len=83):
    tk = MosesTokenizer()
    with open(file_path, 'rb') as fopen:
        raw = fopen.read()
        root = etree.fromstring(raw)
        for sentence in root:
            example = dict()
            example["sentence"] = sentence.find('text').text.lower()

            # for RAN
            tokens = tk.tokenize(example['sentence'])

            terms = sentence.find('aspectTerms')
            if terms is None:
                continue
            example["aspect_sentiment"] = []
            example["left_right"] = []
            example['offset'] = []

            for c in terms:
                target = c.attrib['term'].lower()
                example["aspect_sentiment"].append(
                    (target, c.attrib['polarity']))

                # for td lstm
                left_index = int(c.attrib['from'])
                right_index = int(c.attrib['to'])
                example["left_right"].append(
                    (example['sentence'][:right_index],
                     example['sentence'][left_index:], c.attrib['polarity']))

                # for RAN
                left_word_offset = len(
                    tk.tokenize(example['sentence'][:left_index]))
                right_word_offset = len(
                    tk.tokenize(example['sentence'][right_index:]))
                token_index = list(range(len(tokens)))
                token_length = float(len(token_index))
                for i in range(len(tokens)):
                    if i < left_word_offset:
                        token_index[i] = 1 - (left_word_offset -
                                              token_index[i]) / token_length
                    elif i >= right_word_offset:
                        token_index[i] = 1 - (token_index[i] -
                                              (len(tokens) - right_word_offset)
                                              + 1) / token_length
                    else:
                        token_index[i] = 0
                token_index += [-1.] * (max_offset_len - len(tokens))
                example['offset'].append(
                    (token_index, target, c.attrib['polarity']))
            yield example
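The parser above assumes SemEval-2014 ABSA-style XML, with <sentence>, <text> and <aspectTerms> elements and character offsets in the from/to attributes. A small made-up sample, written to a temp file because the function takes a path, illustrates the expected structure (a sketch, not real SemEval data):

import tempfile

xml = b"""<sentences>
  <sentence id="1">
    <text>The fries were great but the service was slow.</text>
    <aspectTerms>
      <aspectTerm term="fries" polarity="positive" from="4" to="9"/>
      <aspectTerm term="service" polarity="negative" from="29" to="36"/>
    </aspectTerms>
  </sentence>
</sentences>"""

with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as f:
    f.write(xml)

for example in read_sentence14_target(f.name):
    print(example["aspect_sentiment"])
    # [('fries', 'positive'), ('service', 'negative')]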
def process_hierarchical_data(sequences):
    load_wordvec_dict()

    t = MosesTokenizer()
    processed_sequences = np.zeros_like(sequences)
    for i, seq in enumerate(sequences):
        seq = clean_string(seq)
        sentences = sent_tokenize(seq)
        for z, sent in enumerate(sentences):
            sent_t = t.tokenize(sent)
            sent_t = [w.lower() for w in sent_t]
            for j, w in enumerate(sent_t):
                try:
                    sent_t[j] = vocab.index(w)
                except ValueError:
                    # add vocabulary item
                    vocab.append(w)
                    # add embeddings item
                    embds.append([0] * embds_dim)
                    sent_t[j] = len(vocab) - 1

            sentences[z] = sent_t
        processed_sequences[i] = sentences
    seq_lengths = np.asarray(list(map(len, processed_sequences)))
    sent_lengths = np.asarray(
        [list(map(len, seq)) for seq in processed_sequences])
    sent_lengths = pad_sequences(sent_lengths, max_length_allowed=100)[0]
    print("seq_length shape: ")
    print(seq_lengths.shape)
    print(seq_lengths[0:3])
    print("sent_length shape: ")
    print(sent_lengths.shape)
    print(sent_lengths[0:3])
    print("max_sent_length")
    print(sent_lengths.max())
    max_seq_length = seq_lengths.max()
    max_sent_length = sent_lengths.max()  # weird that max returns a list

    processed_sequences = np.asarray([
        pad_sequences(seq,
                      max_length_allowed=max_sent_length,
                      length=max_sent_length,
                      padding_val=0)[0] for seq in processed_sequences
    ])
    processed_sequences = pad_sequences(processed_sequences,
                                        max_length_allowed=max_seq_length,
                                        length=max_seq_length,
                                        padding_val=np.zeros_like(
                                            processed_sequences[0])[0])[0]

    print("Processing Data Finished")
    return processed_sequences, sent_lengths, seq_lengths
Example #19
def print_unrolled_stats_atsa(unrolled_data):
    counter = defaultdict(int)
    length_list = []
    tk = MosesTokenizer()

    for e in unrolled_data:
        counter[e['sentiment']] += 1
        length_list.append(len(tk.tokenize(e['sentence'])))

    for sentiment in sorted(counter.keys()):
        print('#{}:\t{}'.format(sentiment, counter[sentiment]))

    return counter
Example #20
def tokenize_text(text):
    # Tokenizers are basically an advanced split
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    processed_text = tokenizer.tokenize(text)

    # Need to detokenize to get all the weird symbols back as symbols
    processed_text = detokenizer.detokenize(processed_text)

    processed_text = preprocess(processed_text)

    return " ".join(processed_text)
Example #21
def _process_caption(caption):
    """Processes a caption string into a list of tokenized words.

    Args:
      caption: A string caption.

    Returns:
      A list of strings; the tokenized caption.
    """
    tokenizer = MosesTokenizer()
    tokenized_caption = ["SEQUENCE_START"]
    tokenized_caption.extend(tokenizer.tokenize(caption.lower()))
    tokenized_caption.append("SEQUENCE_END")
    return tokenized_caption
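For example (a minimal sketch; the exact tokens depend on the Moses implementation in use):

caption = "A dog runs on the beach."
tokens = _process_caption(caption)
print(tokens)
# roughly: ['SEQUENCE_START', 'a', 'dog', 'runs', 'on', 'the', 'beach', '.', 'SEQUENCE_END']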
def cut_words(data):
    #stopWords = set(nltk.corpus.stopwords.words('english'))
    stopwords = nltk.corpus.stopwords.words('english')
    # Add extra stopwords from import_stop
    for i in import_stop:
        stopwords.append(i)
    #stopwords.append(':')
    moses = MosesTokenizer()
    words = moses.tokenize(data)
    wordsFiltered = []

    for w in words:
        if w not in stopwords:
            wordsFiltered.append(w)
    return (wordsFiltered)
Example #23
class NLTKMosesTokenizer(Component):
    """Class for splitting texts on tokens using NLTK wrapper over MosesTokenizer

    Attributes:
        escape: whether escape characters for use in html markup
        tokenizer: tokenizer instance from nltk.tokenize.moses
        detokenizer: detokenizer instance from nltk.tokenize.moses

    Args:
        escape: whether escape characters for use in html markup
    """

    def __init__(self, escape: bool=False, *args, **kwargs):
        self.escape = escape
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()

    def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
        """Tokenize given batch of strings or detokenize given batch of lists of tokens

        Args:
            batch: list of text samples or list of lists of tokens

        Returns:
            list of lists of tokens or list of text samples
        """
        if isinstance(batch[0], str):
            return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
        else:
            return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
                    for line in batch]
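A usage sketch for the component above, tokenizing a batch of strings and then detokenizing the result (assuming the NLTK Moses wrapper and its data are installed):

tokenizer = NLTKMosesTokenizer()
token_batch = tokenizer(['Hello, world!', 'How are you?'])
# roughly: [['Hello', ',', 'world', '!'], ['How', 'are', 'you', '?']]
text_batch = tokenizer(token_batch)
# roughly: ['Hello, world!', 'How are you?']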
Example #24
File: utils.py Project: xkuang/text
def get_tokenizer(tokenizer):
    if callable(tokenizer):
        return tokenizer
    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except ImportError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise
    elif tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
Example #25
class NLTKMosesTokenizer:
    """Create the Moses Tokenizer implemented by in NLTK.

    From:
        https://www.nltk.org/_modules/nltk/tokenize/moses.html
    
    Examples:
    >>> tokenizer = prenlp.tokenizer.NLTKMosesTokenizer()
    >>> tokenizer('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    >>> tokenizer.tokenize('PreNLP package provides a variety of text preprocessing tools.')
    ['PreNLP', 'package', 'provides', 'a', 'variety', 'of', 'text', 'preprocessing', 'tools', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception:
            # Fetch the NLTK data the Moses port needs, then retry the import.
            import nltk
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer()

    def __call__(self, text: str) -> List[str]:
        return self.tokenize(text)

    def tokenize(self, text: str) -> List[str]:
        return self.tokenizer.tokenize(text, escape=False)
Example #26
class SacreMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in sacremoses.

    Users of this class are required to install
    `sacremoses <https://github.com/alvations/sacremoses>`_.
    For example, one can use :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.SacreMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """

    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            try:
                from nltk.tokenize.moses import MosesTokenizer
                self._tokenizer = MosesTokenizer()
            except ImportError:
                raise ImportError('NLTK is also not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesTokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example #27
class NLTKMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to install `NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKMosesTokenizer()
    >>> tokenizer('Gluon NLP toolkit provides a suite of text processing tools.')
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer('Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools '
    ...           'zur Verfügung.')
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            warnings.warn(
                'NLTK or relevant packages are not installed. '
                'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                'You must install NLTK <= 3.2.5 in order to use the '
                'NLTKMosesTokenizer. You can refer to the official '
                'installation guide in https://www.nltk.org/install.html .'
                ' Now try SacreMosesTokenizer using sacremoses ...')
            try:
                from sacremoses import MosesTokenizer
            except ImportError:
                raise ImportError(
                    'sacremoses is also not installed. '
                    'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                    'To install sacremoses, use pip install -U sacremoses')
        try:
            self._tokenizer = MosesTokenizer()
        except ValueError:
            raise ValueError(
                'The instantiation of MosesTokenizer in sacremoses is'
                ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example #28
class SacreMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in sacremoses.

    Users of this class are required to install
    `sacremoses <https://github.com/alvations/sacremoses>`_.
    For example, one can use :samp:`pip install sacremoses`.

    .. note::
        sacremoses carries an LGPL 2.1+ license.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.SacreMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """
    def __init__(self):
        try:
            from sacremoses import MosesTokenizer
            self._tokenizer = MosesTokenizer()
        except (ImportError, TypeError) as err:
            if isinstance(err, TypeError):
                warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                              ' currently only supported in python3.'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            else:
                warnings.warn('sacremoses is not installed. '
                              'To install sacremoses, use pip install -U sacremoses'
                              ' Now try NLTKMosesTokenizer using NLTK ...')
            try:
                from nltk.tokenize.moses import MosesTokenizer
                self._tokenizer = MosesTokenizer()
            except ImportError:
                raise ImportError('NLTK is also not installed. '
                                  'You must install NLTK <= 3.2.5 in order to use the '
                                  'NLTKMosesTokenizer. You can refer to the official '
                                  'installation guide in https://www.nltk.org/install.html .')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
Example #29
 def __init__(self):
     try:
         from nltk.tokenize.moses import MosesTokenizer
     except ImportError:
         raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                           'in order to use the NLTKMosesTokenizer. You can refer to the '
                           'official installation guide in https://www.nltk.org/install.html .')
     self._tokenizer = MosesTokenizer()
Example #30
def tokenize(msg, tokenizer):
    if tokenizer == 'simple':
        tokens = msg.split(' ')
    elif tokenizer == 'split':
        tokens = msg.split()
    elif tokenizer == 'moses':
        tokens = MosesDetokenizer().unescape_xml(MosesTokenizer().tokenize(
            msg, return_str=True)).split(' ')
    else:
        raise ValueError('unknown tokenizer: {}'.format(tokenizer))
    return (md5_hash(' '.join(tokens)), tokens)
Example #31
class NLTKMosesTokenizer(object):
    """Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to install `NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as
    :samp:`python -m nltk.downloader perluniprops nonbreaking_prefixes`.

    Examples
    --------
    >>> tokenizer = gluonnlp.data.NLTKMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon', 'NLP', 'toolkit', 'provides', 'a', 'suite', 'of', 'text', 'processing', 'tools', '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das', 'Gluon', 'NLP-Toolkit', 'stellt', 'eine', 'Reihe', 'von', 'Textverarbeitungstools', \
'zur', 'Verf\xfcgung', '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            warnings.warn('NLTK or relevant packages are not installed. '
                          'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                          'You must install NLTK <= 3.2.5 in order to use the '
                          'NLTKMosesTokenizer. You can refer to the official '
                          'installation guide in https://www.nltk.org/install.html .'
                          ' Now try SacreMosesTokenizer using sacremoses ...')
            try:
                from sacremoses import MosesTokenizer
            except ImportError:
                raise ImportError('sacremoses is also not installed. '
                                  'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                                  'To install sacremoses, use pip install -U sacremoses')
        try:
            self._tokenizer = MosesTokenizer()
        except ValueError:
            raise ValueError('The instantiation of MosesTokenizer in sacremoses is'
                             ' currently only supported in python3.')

    def __call__(self, sample, return_str=False):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize
        return_str: bool, default False
            True: return a single string
            False: return a list of tokens

        Returns
        -------
        ret : list of strs or str
            List of tokens or tokenized text
        """
        return self._tokenizer.tokenize(sample, return_str=return_str)
    def get_vector(self, inputs, tokenized_corpus, max_word_num, max_sequence_len):
        loader = data_loader.DataLoader(inputs)
        self.data = pd.DataFrame({
            'title': loader.title,
            'context': loader.context,
            'question': loader.question,
            'answer_start': loader.answer_start,
            'answer_end': loader.answer_end,
            'answer_text': loader.answer_text,
        })

        self.tokenizer, self.vocabulary = self.create_vocab(tokenized_corpus,
                                                            max_word_num)

        # Tokenize and add token / token-index columns
        nltk_tokenizer = MosesTokenizer()
        vectors = []
        for i, text_column in enumerate(['context', 'question']):
            self.data[text_column + '_tk'] = self.data[text_column].apply(
                lambda text: nltk_tokenizer.tokenize(text.replace('\n', '').strip(),
                                                     escape=False))

            # Token to index
            self.data[text_column + '_tk_index'] = self.tokenizer.texts_to_sequences(
                self.data[text_column + '_tk'].apply(lambda tokens: ' '.join(tokens)))

            # Padding: collects the context and question vectors
            vectors.append(pad_sequences(self.data[text_column + '_tk_index'],
                                         max_sequence_len[i]))

        return vectors
Example #33
 def __init__(self, config_file):
     """Init from yaml"""
     self.config_file = config_file
     util.load_config(self, config_file)
     # Load dictionary
     with open(self.dictionary.dic_file, 'rb') as f:
         self.dic = pickle.load(f)
     # Moses tokenizer
     self.moses_tokenizer = MosesTokenizer(self.options.language)
     # Load subword tokenizer
     self.subword_tokenizer = sentencepiece.SentencePieceProcessor()
     self.subword_tokenizer.Load(self.subwords.model_file)
     # Load language model
     self.lm = kenlm.Model(self.language_model.model_file)
     # Get the percentile of length normalized scores we'll use as a
     # threshold
     norm_train_scores = np.loadtxt(self.language_model.train_scores)[:, 1]
     self.score_threshold = np.percentile(
         norm_train_scores, self.language_model.score_percentile)
Example #34
 def __init__(self):
     try:
         from sacremoses import MosesTokenizer
         self._tokenizer = MosesTokenizer()
     except (ImportError, TypeError) as err:
         if isinstance(err, TypeError):
             warnings.warn('The instantiation of MosesTokenizer in sacremoses is'
                           ' currently only supported in python3.'
                           ' Now try NLTKMosesTokenizer using NLTK ...')
         else:
             warnings.warn('sacremoses is not installed. '
                           'To install sacremoses, use pip install -U sacremoses'
                           ' Now try NLTKMosesTokenizer using NLTK ...')
         try:
             from nltk.tokenize.moses import MosesTokenizer
             self._tokenizer = MosesTokenizer()
         except ImportError:
             raise ImportError('NLTK is also not installed. '
                               'You must install NLTK <= 3.2.5 in order to use the '
                               'NLTKMosesTokenizer. You can refer to the official '
                               'installation guide in https://www.nltk.org/install.html .')
Example #35
 def __init__(self):
     try:
         from nltk.tokenize.moses import MosesTokenizer
     except ImportError:
         warnings.warn('NLTK or relevant packages are not installed. '
                       'Due to the LGPL 2.1+, moses has been deprecated in NLTK since 3.3.0. '
                       'You must install NLTK <= 3.2.5 in order to use the '
                       'NLTKMosesTokenizer. You can refer to the official '
                       'installation guide in https://www.nltk.org/install.html .'
                       ' Now try SacreMosesTokenizer using sacremoses ...')
         try:
             from sacremoses import MosesTokenizer
         except ImportError:
             raise ImportError('sacremoses is also not installed. '
                               'Please use sacremoses or older nltk version, e.g. 3.2.5. '
                               'To install sacremoses, use pip install -U sacremoses')
     try:
         self._tokenizer = MosesTokenizer()
     except ValueError:
         raise ValueError('The instantiation of MosesTokenizer in sacremoses is'
                          ' currently only supported in python3.')
Example #36
class NLTKMosesTokenizer(object):
    r"""Apply the Moses Tokenizer implemented in NLTK.

    Users of this class are required to `install NLTK <https://www.nltk.org/install.html>`_
    and install relevant NLTK packages, such as:

    .. code:: python

        python -m nltk.downloader perluniprops nonbreaking_prefixes

    Examples
    --------
    >>> tokenizer = NLTKMosesTokenizer()
    >>> tokenizer("Gluon NLP toolkit provides a suite of text processing tools.")
    ['Gluon',
     'NLP',
     'toolkit',
     'provides',
     'a',
     'suite',
     'of',
     'text',
     'processing',
     'tools',
     '.']
    >>> tokenizer("Das Gluon NLP-Toolkit stellt eine Reihe von Textverarbeitungstools "
    ...           "zur Verfügung.")
    ['Das',
     'Gluon',
     'NLP-Toolkit',
     'stellt',
     'eine',
     'Reihe',
     'von',
     'Textverarbeitungstools',
     'zur',
     'Verfügung',
     '.']
    """
    def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except ImportError:
            raise ImportError('NLTK or relevant packages are not installed. You must install NLTK '
                              'in order to use the NLTKMosesTokenizer. You can refer to the '
                              'official installation guide in https://www.nltk.org/install.html .')
        self._tokenizer = MosesTokenizer()

    def __call__(self, sample):
        """

        Parameters
        ----------
        sample: str
            The sentence to tokenize

        Returns
        -------
        ret : list of strs
            List of tokens
        """
        return self._tokenizer.tokenize(sample)