Example #1
def text_to_word_sequence(input_text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=' '):
  """Converts a text to a sequence of words (or tokens).

  This function transforms a string of text into a list of words
  while ignoring `filters`, which include punctuation by default.

  >>> sample_text = 'This is a sample sentence.'
  >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
  ['this', 'is', 'a', 'sample', 'sentence']

  Arguments:
      input_text: Input text (string).
      filters: list (or concatenation) of characters to filter out, such as
          punctuation. Default: `'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'`,
            includes basic punctuation, tabs, and newlines.
      lower: boolean. Whether to convert the input to lowercase.
      split: str. Separator for word splitting.

  Returns:
      A list of words (or tokens).
  """
  return text.text_to_word_sequence(
      input_text, filters=filters, lower=lower, split=split)
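A minimal usage sketch of the wrapper above, assuming keras_preprocessing (or tf.keras) is installed; expected outputs are shown as comments:

from keras_preprocessing.text import text_to_word_sequence

print(text_to_word_sequence('Hello, World!'))            # ['hello', 'world']
print(text_to_word_sequence('a-b-c', split='-'))         # ['a', 'b', 'c']
print(text_to_word_sequence('Keep CASE', lower=False))   # ['Keep', 'CASE']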
Example #2
    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` in a sequence of integers.

        Each item in texts can also be a list,
        in which case we assume each item of that list to be a token.

        Only top "num_words" most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        """
        num_words = self.num_words
        for text in texts:
            if self.char_level or isinstance(text, list):
                seq = text
            else:
                seq = text_to_word_sequence(text, self.filters, self.lower,
                                            self.split)
            vect = []
            for w in seq:
                vect.append(self.resolve_word_or_oov(w))
            yield vect
Example #3
    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        In the case where texts contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        # Arguments
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of list of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                seq = text_to_word_sequence(text, self.filters, self.lower,
                                            self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                # In how many documents each word occurs
                self.word_docs[w] += 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)

        sorted_voc = []

        if self.start_token is not None:
            sorted_voc.append(self.start_token)

        if self.end_token is not None:
            sorted_voc.append(self.end_token)

        if self.oov_token is not None:
            sorted_voc.append(self.oov_token)

        sorted_voc.extend(wc[0] for wc in wcounts)

        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(
            list(zip(sorted_voc, list(range(1,
                                            len(sorted_voc) + 1)))))

        self.index_word = dict((c, w) for w, c in self.word_index.items())

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c
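For comparison, a minimal sketch of the stock keras_preprocessing Tokenizer workflow, without the custom start/end tokens of the subclass above:

from keras_preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(['the cat sat on the mat', 'the dog sat'])
print(tokenizer.word_index)                                     # {'<OOV>': 1, 'the': 2, 'sat': 3, ...}
print(tokenizer.texts_to_sequences(['the cat barked loudly']))  # unknown words map to the OOV index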
Example #4
    def texts_to_sequences(self, texts):
        # Build a (num_texts, padding, embedding_dim) array of word vectors,
        # truncating each text to at most `padding` tokens; missing rows stay zero.
        sentence_features = np.zeros(
            (len(texts), self.padding, self.e.get_dimension()))
        for j, text in enumerate(texts):
            words = text_to_word_sequence(text)
            for i, word in enumerate(words):
                if i >= self.padding:
                    break
                sentence_features[j, i, :] = self.e.get_word_vector(word)
        return sentence_features
Example #5
    def run(self):
        with open(self.xml, encoding="utf-8") as fd:
            tree = xmltodict.parse(fd.read(), xml_attribs=False, force_list=True)
            document = getFullText(tree)
            doc_id = search(tree, "id")[0]
            text = clean_text(document)
            words = text_to_word_sequence(text)
            filtered_doc = [w.lower() for w in words if w not in self.stop_words
                            and w != '' and w.isalpha() and len(w) > 1]

            self.corpus[doc_id] = dict((i, j) for (i, j) in nltk.Counter(filtered_doc).items())
            self.corpusWcount[doc_id] = filtered_doc
Example #6
def get_word2vec_embedding(data):
    sentences_words = list(
        text_to_word_sequence(x, filters=[], lower=False) for x in data)
    model = Word2Vec(sentences_words,
                     size=embedding_length,
                     workers=4,
                     min_count=1,
                     sg=1,
                     hs=1,
                     iter=5)

    print("Number of word vectors: {}".format(len(model.wv.vocab)))
    return model.wv
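A brief sketch of consuming the returned KeyedVectors; the size/iter/vocab names above imply the gensim 3.x API, and embedding_length is assumed to be defined elsewhere in the module:

word_vectors = get_word2vec_embedding(["first sample sentence", "another sample sentence"])
print(word_vectors["sample"].shape)                 # (embedding_length,)
print(word_vectors.most_similar("sample", topn=2))  # nearest neighbours in the tiny toy corpus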
Example #7
    def read_sequence_from_file(self, filename):
        def get_mapping(word):
            if word in self.word_to_id.keys():
                mapping = self.word_to_id[word]
                if self.skip_top <= mapping < self.max_features:
                    return mapping
            # Out of vocabulary char
            return 2

        with open(filename) as file:
            word_seq = text_to_word_sequence(file.read())

        index_seq = [get_mapping(word) for word in word_seq]
        index_seq.insert(0, 1)

        return index_seq
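A hypothetical companion helper (not part of the original class) that reverses the mapping above, using the same convention of 1 for the start marker and 2 for out-of-vocabulary words:

def decode_sequence(index_seq, word_to_id):
    # Invert the word -> id mapping and translate the reserved indices back to markers.
    id_to_word = {i: w for w, i in word_to_id.items()}
    specials = {1: '<start>', 2: '<oov>'}
    return [specials.get(i, id_to_word.get(i, '<oov>')) for i in index_seq]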
Example #8
def create_corpus(output_file_name, data):
    len_data = len(data)
    output_corpus = open(output_file_name, "w", encoding="utf-8")
    my_filters = '"#$&()*+/:;<=>?@[\\]^_`{|}~\t\n'

    with click.progressbar(length=len_data, label="CREATE CORPUS: ", fill_char=click.style('=', fg='white')) as bar:
        for i in range(0, len_data):
            tmp = str(data[i]).lower()
            tmp = text.text_to_word_sequence(text=tmp,
                                             filters=my_filters)
            tmp = " ".join(map(str, tmp))
            output_corpus.write(tmp + "\n")

            bar.update(1)

    output_corpus.close()
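A hypothetical invocation of create_corpus; note that this filter set deliberately keeps '.', ',', '!', '%' and '-' attached to the lower-cased tokens written to the corpus file:

create_corpus("corpus.txt", ["First sample sentence!", "Second, shorter sample."])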
Example #9
    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Each item in texts can also be a list,
        in which case we assume each item of that list to be a token.

        Only top "num_words" most frequent words will be taken into account.
        Only words known by the tokenizer will be taken into account.

        # Arguments
            texts: A list of texts (strings).

        # Yields
            Yields individual sequences.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        end_token_index = self.word_index.get(self.end_token)
        start_token_index = self.word_index.get(self.start_token)
        for text in texts:
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                seq = text_to_word_sequence(text, self.filters, self.lower,
                                            self.split)
            vect = []
            if self.start_token is not None:
                vect.append(start_token_index)

            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        if oov_token_index is not None:
                            vect.append(oov_token_index)
                    else:
                        vect.append(i)
                elif self.oov_token is not None:
                    vect.append(oov_token_index)
            if self.end_token is not None:
                vect.append(end_token_index)
            yield vect
Example #10
    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        In the case where texts contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        # Arguments
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of list of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                seq = text
            else:
                seq = text_to_word_sequence(text, self.filters, self.lower,
                                            self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                if w in self.word_docs:
                    self.word_docs[w] += 1
                else:
                    self.word_docs[w] = 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        sorted_voc = [wc[0] for wc in wcounts]
        # note that indices 0 and 1 are reserved, never assigned to an existing word
        self.word_index = dict(
            list(zip(sorted_voc, list(range(2,
                                            len(sorted_voc) + 2)))))

        # index 1 is reserved for the oov token
        if self.oov_token is not None:
            i = self.word_index.get(self.oov_token)
            if i is None:
                self.word_index[self.oov_token] = 1

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c
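The docstring above also mentions texts_to_matrix; a short sketch with the stock keras_preprocessing Tokenizer:

from keras_preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=50)
tokenizer.fit_on_texts(['the cat sat on the mat', 'the dog sat'])
print(tokenizer.texts_to_matrix(['the cat sat'], mode='count'))   # one row per text, one column per word index
print(tokenizer.texts_to_matrix(['the cat sat'], mode='binary'))  # 0/1 presence indicators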
Example #11
def prep_1(text):
    text = "The quick brown fox jumped over the lazy dog."

    list_unique_words = list(set(text_to_word_sequence(text)))
    print(f"docs: {list_unique_words[:100]}")

    vocab_size = len(list_unique_words)
    print(f"vocab_size: {vocab_size}")

    oh_encoding = one_hot(text, n=round(vocab_size * 1.3))
    print(f"oh_encoding: {oh_encoding}")

    hashed_doc = hashing_trick(text,
                               n=round(vocab_size * 1.3),
                               hash_function='md5')
    print(f"hashed_doc: {hashed_doc}")

    return oh_encoding
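A compact illustration of the two hashing-based encoders used above; indices can collide because both hash words into a fixed range of size n:

from keras_preprocessing.text import one_hot, hashing_trick

doc = 'the quick brown fox'
print(one_hot(doc, n=10))                             # e.g. [3, 7, 1, 9]; uses Python's hash, not stable across runs
print(hashing_trick(doc, n=10, hash_function='md5'))  # md5-based, deterministic across runs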
Example #12
def process(tweet):
    stop_words = get_stop_words()
    base_filters = '\n\t!"#$%&()*+,-–./:;<=>?[\]^_`{|}~ 0123456789'

    tweet = str(tweet)
    tweet = re.sub(r'^RT[\s]+', '',
                   tweet)  # remove old style retweet text "RT"
    tweet = re.sub(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
        '', tweet)  # remove hyperlinks
    tweet = tweet.replace('\'', '')
    new_list = [
        x
        for x in text_to_word_sequence(tweet, filters=base_filters, lower=True)
        if not x.startswith("@")
    ]
    final = [w for w in new_list if w not in stop_words]
    return final
Example #13
    def get_query_and_candidate_terms(self, sequence_length=20):

        stop_words = set(stopwords.words('english'))

        query_texts = list()
        candidate_terms = list()
        query_terms_texts = list()
        keyword_terms_texts = list()

        for query in self.query_list:
            query_texts.append(query.query)

            terms = [''] * (sequence_length * 2)
            query_terms = text_to_word_sequence(query.query)
            terms[:sequence_length] = [
                term for term in query_terms if term not in stop_words
            ][:sequence_length]
            terms[sequence_length:] = [
                keyword for _, keyword in query.keywords
            ][:sequence_length]
            candidate_terms.append(terms)

            query_terms_texts.append(' '.join(terms[:sequence_length]))
            keyword_terms_texts.append(' '.join(terms[sequence_length:]))

        query_sequence = self.tokenizer.texts_to_sequences(query_texts)
        query_sequence = pad_sequences(query_sequence,
                                       maxlen=sequence_length * 2)

        query_terms_sequence = self.tokenizer.texts_to_sequences(
            query_terms_texts)
        query_terms_sequence = pad_sequences(query_terms_sequence,
                                             maxlen=sequence_length,
                                             padding='post')
        keyword_terms_sequence = self.tokenizer.texts_to_sequences(
            keyword_terms_texts)
        keyword_terms_sequence = pad_sequences(keyword_terms_sequence,
                                               maxlen=sequence_length)

        terms_sequence = np.hstack(
            [query_terms_sequence, keyword_terms_sequence])

        return self.query_list, query_sequence, terms_sequence, candidate_terms
Example #14
    def s_r(self, t, r, stop_words, size=80, node=None):
        if isinstance(t, dict):
            for (k, v) in t.items():
                for i, j in enumerate(v, 0):
                    words = text_to_word_sequence(clean_text(getFullText(j)))
                    words = [
                        w.lower() for w in words if w not in stop_words
                        and w != '' and w.isalpha() and len(w) > 1
                    ]
                    node = node if node is not None else "/"
                    if len(words) > size:
                        if k == "#text":
                            r[node] = words
                        else:
                            r["{}/{}[{}]".format(node, k, i + 1)] = words
                        self.s_r(j,
                                 r,
                                 stop_words,
                                 size,
                                 node="{}/{}[{}]".format(node, k, i + 1))

        return r
Example #15
def test_text_to_word_sequence_unicode():
    sample_text = u'ali! veli? kırk dokuz elli'
    assert text.text_to_word_sequence(sample_text) == [
        u'ali', u'veli', u'kırk', u'dokuz', u'elli'
    ]
Example #16
def test_text_to_word_sequence_multichar_split():
    sample_text = 'hello!stop?world!'
    assert text.text_to_word_sequence(sample_text,
                                      split='stop') == ['hello', 'world']
Example #17
def test_text_to_word_sequence():
    sample_text = 'hello! ? world!'
    assert text.text_to_word_sequence(sample_text) == ['hello', 'world']
Example #18
def default_tokenizer(text: str) -> List[str]:
    """Default function to tokenize text."""
    return text_to_word_sequence(text,
                                 lower=False,
                                 filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
Example #19
def embed(x_lstm, y, x_fe, x_test, x_test_lstm, y_test, needs_embed):
    """
    Embeds the text in the training and test set using the wiki-news pre-trained vectors.
    :param x_lstm: training set text
    :param y: training set labels
    :param x_fe: training set features
    :param x_test: test set features
    :param x_test_lstm: test set text
    :param y_test: test set labels
    :param needs_embed: boolean indicating whether to embed the text.
        The text should be embedded if the project does not include the "embedded_matrix",
        "embedded_text" and "embedded_text_test" files; to embed, "wiki-news-300d-1M.vec"
        must be included in the project.
    """
    max_len = 197

    if needs_embed:
        #for train and validation set
        word2index, embedding_matrix = load_embeddings('wiki-news-300d-1M.vec',
                                                       embedding_dim=300)

        out_matrix = []

        for text in x_lstm['text'].tolist():
            indices = []
            for w in text_to_word_sequence(text):
                indices.append(word2index[re.sub(r'[^\w\s]', '', w)])
            if len(indices) > max_len:
                max_len = len(indices)
            out_matrix.append(indices)

        encoded_texts = out_matrix

        padded_texts = pad_sequences(encoded_texts,
                                     maxlen=max_len,
                                     padding='post')

        store_data(padded_texts, 'embedded_text')
        store_data(embedding_matrix, 'embedded_matrix')

        # for test set
        word2index, embedding_matrix = load_embeddings('wiki-news-300d-1M.vec',
                                                       embedding_dim=300)

        out_matrix = []

        for text in x_test_lstm['text'].tolist():
            indices = []
            for w in text_to_word_sequence(text):
                # Scotfree is present in the data, but it is not in wiki-news and
                # throws error
                if w == 'scotfree':
                    continue
                indices.append(word2index[re.sub(r'[^\w\s]', '', w)])
            if len(indices) > max_len:
                max_len = len(indices)
            out_matrix.append(indices)

        encoded_texts = out_matrix
        padded_texts = pad_sequences(encoded_texts,
                                     maxlen=max_len,
                                     padding='post')
        store_data(padded_texts, 'embedded_text_test')

    embedding_matrix = load_data('embedded_matrix')
    padded_texts = load_data('embedded_text')
    embedding_matrix_test = load_data('embedded_text_test')
    for idx, el in enumerate(padded_texts):
        dataframes[0]['text'][idx] = el

    for idx, el in enumerate(embedding_matrix_test):
        dataframes[3]['text'][idx] = el
    x_test_lstm = dataframes[3]['text']
    do_kfold_validation(x_fe, y, embedding_matrix, max_len, x_test,
                        x_test_lstm, y_test)
Example #20
                             sep=',',
                             header=0,
                             encoding='latin-1')
else:
    raise ValueError(
        "Input file is missing! Either the action is invalid, or an existing "
        "\"existingModelPath\" must be provided!"
    )

print(input_data['target'].value_counts())

#Tokenize Tweets
tweets = input_data['text'].tolist()

train_sequences = []
for tweet in tweets:
    train_sequences.append(text_to_word_sequence(str(tweet)))

if str(args.embedding).lower() == 'bert':

    print('Using BERT embedding')
    if args.action == 'new':
        print('Training new Model')

        train_x, test_x, train_y, test_y = train_test_split(
            train_sequences,
            input_data['target'],
            test_size=0.1,
            random_state=1234)
        train_x, valid_x, train_y, valid_y = train_test_split(
            train_x, train_y, test_size=0.1, random_state=1234)
Example #21
def test_text_to_word_sequence_unicode_multichar_split():
    sample_text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text.text_to_word_sequence(sample_text, split='stop') == [
        u'ali', u'veli', u'kırk', u'dokuz', u'elli'
    ]
Example #22
def keras_tokenize(text):
    text = clean_text(text)
    tokens = text_to_word_sequence(text)
    return tokens
Example #23
from keras_preprocessing.text import text_to_word_sequence
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.text import one_hot

text = "Hei, dette er noe testtext"

tronder_file = open("TextInput/rawText.txt", "r", encoding="utf-8")
tronder_text = tronder_file.read()
tronder_file.close()

one_hot_result = one_hot(tronder_text, len(tronder_text))
ttws_result = text_to_word_sequence(tronder_text)

print(ttws_result)
print(one_hot_result)
print(len(ttws_result))
print(len(one_hot_result))