def test_tokenize_ideal(self):
     """
     Ideal tokenize scenario
     """
     expected = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy']
     actual = tokenize('The weather is sunny, the man is happy.')
     self.assertEqual(expected, actual)
 def test_tokenize_several_sentences(self):
     """
     Tokenize text with several sentences
     """
     expected = ['the', 'first', 'sentence', 'the', 'second', 'sentence']
     actual = tokenize('The first sentence. The second sentence.')
     self.assertEqual(expected, actual)
 def test_tokenize_punctuation_marks(self):
     """
     Tokenize text with different punctuation marks
     """
     expected = ['the', 'first', 'sentence', 'nice', 'the', 'second', 'sentence', 'bad']
     actual = tokenize('The, first sentence - nice. The second sentence: bad!')
     self.assertEqual(expected, actual)
Example #4
def filter_(func, lst):
    l = pr.tokenize(lst)
    if l == [""]: return const.FALSE
    return apply(makelist, [
        elm for elm in pr.mapevalconst(l)
        if pr.booltopy(functionisntace(func)(elm))
    ])
 def test_tokenize_dirty_text(self):
     """
     Tokenize dirty text
     """
     expected = ['the', 'first', 'sentence', 'the', 'second', 'sentence']
     actual = tokenize('The first% sentence><. The sec&*ond sent@ence #.')
     self.assertEqual(expected, actual)
Example #6
def get_senses(probability_map, instance, window_size, penalty_score):
    sense_prob_map = {}  # {sense: score} probability for each possible sense of the instance
    map_of_senses = probability_map[instance[main.word_tag]]
    for sense in map_of_senses.keys():
        sense_prob_map[sense] = (map_of_senses[sense])[0]  # set initial probability to P(s)
        # map of feature words and probabilities for the sense
        feature_map = (map_of_senses[sense])[1]
        sentence = instance[main.prev] + " " + main.target_token + " " + instance[main.next]
        list_of_features = main.tokenize(sentence)  # feature words for the wordInstance
        list_of_features = supervisedModel.words_in_window(list_of_features, window_size)
        stopwordList = stopwords.words('english')
        punctuation = [".",",",":","'","?","$","-","\""]
        for p in punctuation:
            stopwordList.append(p)
        # remove all the stop words (build a new list; removing while iterating skips items)
        list_of_features = [item for item in list_of_features if item not in stopwordList]
        # stem all the feature words
        for j in range(len(list_of_features)):
            list_of_features[j] = main.stem(list_of_features[j])
        # remove duplicates in the sentence
        list_of_features = list(set(list_of_features))
        # multiply each p(f_j|s) for the instance's feature words into sense_prob_map
        for f in list_of_features:
            # if feature word exists in map, update the probability
            if f in feature_map.keys():
                sense_prob_map[sense] *= feature_map[f]
            else:
                sense_prob_map[sense] *= penalty_score
    return sense_prob_map
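get_senses is essentially Naive Bayes word-sense scoring: each candidate sense starts at its prior P(s) and is multiplied by P(f_j|s) for every feature word in the context window, with penalty_score standing in for features never seen with that sense. A self-contained sketch of just that scoring step, with made-up names (naive_bayes_scores, priors, feature_probs) and toy numbers:

def naive_bayes_scores(features, priors, feature_probs, penalty_score=1e-6):
    # Score each sense as P(s) * prod P(f|s), falling back to penalty_score for unseen features
    scores = {}
    for sense, prior in priors.items():
        score = prior
        for f in features:
            score *= feature_probs.get(sense, {}).get(f, penalty_score)
        scores[sense] = score
    return scores

# Toy usage with invented probabilities
priors = {'financial_bank': 0.6, 'river_bank': 0.4}
feature_probs = {
    'financial_bank': {'money': 0.3, 'loan': 0.2},
    'river_bank': {'water': 0.4, 'shore': 0.3},
}
print(naive_bayes_scores(['money', 'loan'], priors, feature_probs))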
 def test_tokenize_big_text_case(self):
     """
     Tokenize big input text scenario
     """
     text = read_from_file('lab_1/tokens.txt')
     expected = text.split()
     actual = tokenize(text)
     self.assertEqual(expected, actual)
 def test_tokenize_big_text_length_equal(self):
     """
     Tokenize big input text and compare the token count with a whitespace split
     """
     text = read_from_file('lab_1/tokens.txt')
     expected = len(text.split())
     actual = len(tokenize(text))
     self.assertEqual(expected, actual)
 def test_tokenize_bad_input(self):
     """
     Tokenize bad input argument scenario
     """
     bad_inputs = [[], {}, (), None, 9, 9.34, True]
     expected = []
     for bad_input in bad_inputs:
         actual = tokenize(bad_input)
         self.assertEqual(expected, actual)
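Taken together, these tests pin down what tokenize is expected to do: lowercase the text, strip punctuation and other non-letter characters, split on whitespace, and return an empty list for any non-string input. A minimal sketch consistent with those expectations (named tokenize_sketch to mark it as illustrative, not the lab's reference implementation):

import re

def tokenize_sketch(text):
    # Non-string inputs (None, numbers, bools, containers, ...) yield an empty list
    if not isinstance(text, str):
        return []
    # Drop everything except letters and whitespace, lowercase, then split
    cleaned = re.sub(r'[^a-z\s]', '', text.lower())
    return cleaned.split()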
Example #10
 def test_big_text_get_adjacent_words_term(self):
     """
     Checks if adjacent words for a given term can be found properly
     """
     text = read_from_file('lab_1/data.txt')
     tokens = tokenize(text)
     expected = [['although', 'products']]
     actual = get_adjacent_words(tokens, 'tex', 4, 31)
     self.assertEqual(expected, actual)
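The expected value here implies that get_adjacent_words keeps only the outermost left and right neighbours of each concordance context. A rough sketch of both helpers as the tests describe them (the _sketch suffix marks them as inferred, not the lab's actual code):

def get_concordance_sketch(tokens, word, left_n, right_n):
    # For every occurrence of word, take left_n tokens before it and right_n tokens after it
    contexts = []
    for i, token in enumerate(tokens):
        if token == word:
            contexts.append(tokens[max(0, i - left_n):i + right_n + 1])
    return contexts

def get_adjacent_words_sketch(tokens, word, left_n, right_n):
    # Keep only the leftmost and rightmost word of each context
    return [[context[0], context[-1]]
            for context in get_concordance_sketch(tokens, word, left_n, right_n)]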
Example #11
 def FUNCTION(*arg):
     # arg bound here receives the actual arguments
     # dmyarg is the formal parameter list, e.g. "(x , y , z)"
     if dmyarg == "nil":
         lst = [pr.eval(each, scope) for each in body]
     else:
         s = pr.joindict({e: arg[i] for i, e in enumerate(pr.tokenize(dmyarg))}, scope)
         lst = [pr.eval(each, s) for each in body]
     return lst.pop()
Example #12
 def FUNCTION(*arg):
     # arg bound here receives the actual arguments
     # dmyarg is the formal parameter list, e.g. "(x , y , z)"
     if dmyarg == "nil":
         lst = [pr.eval(each, scope) for each in body]
     else:
         s = pr.joindict(
             {e: arg[i]
              for i, e in enumerate(pr.tokenize(dmyarg))}, scope)
         lst = [pr.eval(each, s) for each in body]
     return lst.pop()
Example #13
def update_feature_map(feature_map, word, item):
    # get all feature words
    sentence = item[main.prev] + " " + item[main.next]
    list_of_features = main.tokenize(sentence)
    # stem all words in the sentence
    for i in range(len(list_of_features)):
        list_of_features[i] = main.stem(list_of_features[i])
    # remove duplicates in the sentence
    list_of_features = list(set(list_of_features))
    for f in list_of_features:
        increment_map_value(feature_map, f)
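increment_map_value is not part of this excerpt; judging from how it is called, it just bumps a per-feature counter, roughly as in the sketch below (assumed, not taken from the project):

def increment_map_value(feature_map, key):
    # Count one more occurrence of key, starting at zero for unseen keys
    feature_map[key] = feature_map.get(key, 0) + 1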
Example #14
    def test_get_adjacent_words_several_contexts_big_text(self):
        """
        Checks if adjacent words for a given term can be found in real text properly
        """
        text = read_from_file('lab_1/data.txt')
        tokens = tokenize(text)

        expected = [['epithelial', 'channels'], ['means', 'aluminate'],
                    ['by', 'bicarbonate'], ['the', 'salt']]
        actual = get_adjacent_words(tokens, 'sodium', 1, 1)
        self.assertEqual(expected, actual)
Example #15
    def test_big_text_get_and_sort_concordance_term(self):
        """
        Checks that contexts for a given term are found and sorted correctly
        """
        text = read_from_file('lab_1/data.txt')
        tokens = tokenize(text)

        expected = [['although', 'less', 'compact', 'than', 'tex', 'the',
                     'xml', 'structuring', 'promises', 'to', 'make', 'it',
                     'widely', 'usable', 'and', 'allows', 'for', 'instant',
                     'display']]
        actual = sort_concordance(tokens, 'tex', 4, 14, True)
        self.assertEqual(expected, actual)
Example #16
    def test_get_concordance_several_contexts_big_text_right(self):
        """
        Checks that contexts for a given term are found in real text properly,
        taking the right context into account
        """
        text = read_from_file('lab_1/data.txt')
        tokens = tokenize(text)

        expected = [['means', 'sodium', 'aluminate'],
                    ['by', 'sodium', 'bicarbonate'],
                    ['epithelial', 'sodium', 'channels'],
                    ['the', 'sodium', 'salt']]
        actual = sort_concordance(tokens, 'sodium', 1, 1, False)
        self.assertEqual(expected, actual)
    def test_big_text_get_concordance_term(self):
        """
        Checks if a context for a given term can be found properly
        """
        text = read_from_file('lab_1/data.txt')
        tokens = tokenize(text)

        expected = [['although', 'less', 'compact', 'than', 'tex', 'the',
                     'xml', 'structuring', 'promises', 'to', 'make', 'it',
                     'widely', 'usable', 'and', 'allows', 'for', 'instant',
                     'display', 'in', 'applications', 'such', 'as', 'web',
                     'browsers', 'and', 'facilitates', 'an', 'interpretation',
                     'of', 'its', 'meaning', 'in', 'mathematical', 'software', 'products']]
        actual = get_concordance(tokens, 'tex', 4, 31)
        self.assertEqual(expected, actual)
def evaluate(sentence, two_way_X, two_way_y, max_length=300):
    encoder_input = tf.cast(tf.convert_to_tensor([tokenize(two_way_X, sentence)]), tf.int64)
    start, end = two_way_y.get('<start>'), two_way_y.get('<end>')
    output = tf.convert_to_tensor([start])
    output = tf.expand_dims(output, 0)
    output = tf.cast(output, tf.int64)

    for i in range(max_length):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            encoder_input, output)
        # predictions.shape == (batch_size, seq_len, vocab_size)
        predictions, attention_weights = transformer(encoder_input,
                                                     output,
                                                     False,
                                                     enc_padding_mask,
                                                     combined_mask,
                                                     dec_padding_mask)
        # select the last word from the seq_len dimension
        predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)
        """ CHANGE START: decomment only one of them """
        """ 1 - ORIGINAL"""
        predicted_id = tf.argmax(predictions, axis=-1)
        """ 2 - MODIFIED"""
        """
        predicted_id_orig = tf.argmax(predictions, axis=-1)
        count = 0
        while True:
            if count == 5:
                predicted_id = predicted_id_orig
                break
            predicted_id = tf.argmax(predictions, axis=-1)
            # concatenate the predicted_id to the output, which is given to the decoder as its input.
            if check_next_syl(two_way_y, copy.deepcopy(predicted_id), output, sentence):
                break
            predictions = predictions.numpy()
            predictions[:, :, predicted_id.numpy()] = -100
            predictions = tf.convert_to_tensor(predictions)
            count += 1
            """
        """ CHANGE STOP """

        output = tf.concat([output, predicted_id], axis=-1)
        # return the result if the predicted_id is equal to the end token
        if predicted_id == end:
            break
    # output.shape (1, tokens)
    text = detokenize(two_way_y, output)
    return text, attention_weights
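Stripped of the Transformer plumbing, the loop in evaluate is plain greedy decoding: score the next position given everything generated so far, take the argmax, append it, and stop at the end token. A framework-free sketch of that pattern, where next_token_logits stands in for the transformer call and is not part of the original code:

import numpy as np

def greedy_decode(next_token_logits, start_id, end_id, max_length=300):
    # next_token_logits(output_ids) -> 1-D array of scores over the vocabulary
    output = [start_id]
    for _ in range(max_length):
        predicted_id = int(np.argmax(next_token_logits(output)))
        output.append(predicted_id)
        if predicted_id == end_id:
            break
    return output

# Toy usage: a "model" that predicts token 3 twice, then the end token 0
print(greedy_decode(lambda ids: np.eye(5)[0 if len(ids) > 2 else 3], start_id=1, end_id=0))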
Example #19
def last(lst):
    l = pr.tokenize(lst)
    # if the list is nil, return nil
    return const.FALSE if l[0] == "" else l[len(l) - 1]
Example #20
    intents = json.load(f)

# print(intents)
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from model import NeuralNet
all_words = []
tags = []
xy = []
for intent in intents['intents']:
    tag = intent['tag']
    tags.append(tag)
    for pattern in intent['patterns']:
        w = tokenize(pattern)
        all_words.extend(w)
        xy.append((w, tag))

ignore_words = ['?', '!', '.', ',', '¿', '¡']
all_words = [stem(w) for w in all_words if w not in ignore_words]
all_words = sorted(set(all_words))
tags = sorted(set(tags))
# print(tags)

X_train = []
y_train = []

for (pattern_sentence, tag) in xy:
    bag = bag_of_words(pattern_sentence, all_words)
    X_train.append(bag)
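bag_of_words is imported from elsewhere in this project; from the way its result is collected here (and reshaped and passed to torch.from_numpy in the chat loop further down), it presumably returns a float vector with one slot per entry of all_words, set to 1.0 when the stemmed word occurs in the sentence. A sketch under that assumption, reusing the same stem helper and numpy import as above:

def bag_of_words_sketch(tokenized_sentence, all_words):
    # 1.0 at every position whose vocabulary word appears (after stemming) in the sentence
    sentence_words = [stem(w) for w in tokenized_sentence]
    bag = np.zeros(len(all_words), dtype=np.float32)
    for idx, w in enumerate(all_words):
        if w in sentence_words:
            bag[idx] = 1.0
    return bag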
Example #21
def init(lst):
    l = pr.tokenize(lst)
    return apply(makelist, l) if l[0] == "" else apply(makelist, l[0:len(l) - 1])
Example #22
def map_(func, lst):
    l = pr.tokenize(lst)
    # if the list is nil, return nil
    if l == [""]: return const.FALSE
    # the elements split out by tokenize are still plain strings
    return apply(makelist, map(functionisntace(func), pr.mapevalconst(l)))
"""
Concordance implementation starter
"""

import os
import main

if __name__ == '__main__':
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data = main.read_from_file(os.path.join(current_dir, 'data.txt'))
    stop_words = main.read_from_file(
        os.path.join(current_dir, 'stop_words.txt'))

    tokens = main.tokenize(data)
    print(f'Raw text: {data[:5]}')
    print(f'Tokenized text: {tokens[:5]}')

    tokens = main.remove_stop_words(tokens, stop_words)
    print(f'Text without stop-words: {tokens[:5]}')

    frequencies = main.calculate_frequencies(tokens[:5000])
    print(f'Frequencies: {frequencies[tokens[0]]}')

    word = 'dog'
    concordance = main.get_concordance(tokens, word, 2, 0)
    print(f'The concordance for {word}: {concordance[:5]}')

    adjacent = main.get_adjacent_words(tokens, 'dog', 2, 0)
    print(f'Adjacent words: {adjacent[:5]}')

    sorted_concordance = main.sort_concordance(tokens, 'dog', 2, 0, True)
Example #24
def map_(func, lst):
    l = pr.tokenize(lst)
    # if the list is nil, return nil
    if l == [""]: return const.FALSE
    # the elements split out by tokenize are still plain strings
    return apply(makelist, map(functionisntace(func), pr.mapevalconst(l)))
Example #25
    "tan"     :(True , math.tan , False),
    
    "+"       :(True , (lambda *arg: reduce((lambda x,y: x+y) , arg)) , False),
    "-"       :(True , (lambda *arg: reduce((lambda x,y: x-y) , arg)) , False),
    "*"       :(True , (lambda *arg: reduce((lambda x,y: x*y) , arg)) , False),
    "/"       :(True , (lambda *arg: reduce((lambda x,y: x/y) , arg)) , False),
    ">"       :(True , (lambda x,y: pr.booltolisp(x>y)) , False),
    ">="      :(True , (lambda x,y: pr.booltolisp(x>=y)), False),
    "<"       :(True , (lambda x,y: pr.booltolisp(x<y)) , False),
    "<="      :(True , (lambda x,y: pr.booltolisp(x<=y)) , False),
    "="       :(True , (lambda x,y: pr.booltolisp(x == y)) , False),
    "and"     :(True , (lambda x,y: pr.booltolisp(pr.booltopy(x) and pr.booltopy(y))) , False),
    "or"      :(True , (lambda x,y: pr.booltolisp(pr.booltopy(x) or pr.booltopy(y))) , False),
    "not"     :(True , (lambda x: pr.booltolisp(not pr.booltopy(x)) ) , False),
  
    "cons"    :(True , (lambda x,y: apply(makelist,[x] + pr.tokenize(y))) , False),
    "car"     :(True , (lambda x:pr.tokenize(x)[0]) , False),
    "cdr"     :(True , (lambda x:apply(makelist ,pr.tokenize(x)[1:])) , False),
    "list"    :(True , makelist , False),
    "last"    :(True , last , False),
    "length"  :(True , (lambda x: len(pr.tokenize(x))) , False),
    "init"    :(True , init , False),
    "map"     :(True , map_ , False),
    "filter"  :(True , filter_  , False),

    "list?"   :(True , (lambda x:pr.booltolisp(pr.W_islist(x))) , False),
    "atom?"   :(True , (lambda x:pr.booltolisp(pr.W_isatom(x))) , False),
    "symbol?" :(True , (lambda x:pr.booltolisp(pr.W_issymbol(x))) , False),
    "null?"   :(True , (lambda x:pr.booltolisp(pr.W_isnil(x))) , False),
    "equal?"  :(True , (lambda x,y: pr.booltolisp(x == y)) , False),
    
Example #26
def init(lst):
    l = pr.tokenize(lst)
    return apply(makelist, l) if l[0] == "" else apply(makelist, l[0:len(l) - 1])
Example #27
    "tan"     :(True , math.tan , False),

    "+"       :(True , (lambda *arg: reduce((lambda x,y: x+y) , arg)) , False),
    "-"       :(True , (lambda *arg: reduce((lambda x,y: x-y) , arg)) , False),
    "*"       :(True , (lambda *arg: reduce((lambda x,y: x*y) , arg)) , False),
    "/"       :(True , (lambda *arg: reduce((lambda x,y: x/y) , arg)) , False),
    ">"       :(True , (lambda x,y: pr.booltolisp(x>y)) , False),
    ">="      :(True , (lambda x,y: pr.booltolisp(x>=y)), False),
    "<"       :(True , (lambda x,y: pr.booltolisp(x<y)) , False),
    "<="      :(True , (lambda x,y: pr.booltolisp(x<=y)) , False),
    "="       :(True , (lambda x,y: pr.booltolisp(x == y)) , False),
    "and"     :(True , (lambda x,y: pr.booltolisp(pr.booltopy(x) and pr.booltopy(y))) , False),
    "or"      :(True , (lambda x,y: pr.booltolisp(pr.booltopy(x) or pr.booltopy(y))) , False),
    "not"     :(True , (lambda x: pr.booltolisp(not pr.booltopy(x)) ) , False),

    "cons"    :(True , (lambda x,y: apply(makelist,[x] + pr.tokenize(y))) , False),
    "car"     :(True , (lambda x:pr.tokenize(x)[0]) , False),
    "cdr"     :(True , (lambda x:apply(makelist ,pr.tokenize(x)[1:])) , False),
    "list"    :(True , makelist , False),
    "last"    :(True , last , False),
    "length"  :(True , (lambda x: len(pr.tokenize(x))) , False),
    "init"    :(True , init , False),
    "map"     :(True , map_ , False),
    "filter"  :(True , filter_  , False),

    "list?"   :(True , (lambda x:pr.booltolisp(pr.W_islist(x))) , False),
    "atom?"   :(True , (lambda x:pr.booltolisp(pr.W_isatom(x))) , False),
    "symbol?" :(True , (lambda x:pr.booltolisp(pr.W_issymbol(x))) , False),
    "null?"   :(True , (lambda x:pr.booltolisp(pr.W_isnil(x))) , False),
    "equal?"  :(True , (lambda x,y: pr.booltolisp(x == y)) , False),
Example #28
"""
Concordance implementation starter
"""

import os
import main

if __name__ == '__main__':
    #  use data.txt file to test your program
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data = main.read_from_file(os.path.join(current_dir, 'data.txt'))
    stop_words = main.read_from_file(
        os.path.join(current_dir, 'stop_words.txt')).split('\n')

    #  here goes your logic: calling methods from concordance.py
    tokens = main.tokenize(data)
    print('tokens:', tokens[:10])
    print('\n-----------------------------\n')

    tokens = main.remove_stop_words(tokens,
                                    stop_words)  # old: 34 sec, new - 3.4 sec
    print('tokens without stop words:', tokens[:10])
    print('\n-----------------------------\n')

    frequencies = main.calculate_frequencies(
        tokens)  # old: 116 sec, new: ~81 sec
    print('frequency for the first word:', frequencies[tokens[0]])
    print('\n-----------------------------\n')

    top_10 = main.get_top_n_words(frequencies, 10)
    print('top 10 words:', top_10)
Example #29
def last(lst):
    l = pr.tokenize(lst)
    # if the list is nil, return nil
    return const.FALSE if l[0] == "" else l[len(l)-1]
Example #30
def filter_(func, lst):
    l = pr.tokenize(lst)
    if l == [""]: return const.FALSE
    return apply(makelist, [elm for elm in pr.mapevalconst(l)
                            if pr.booltopy(functionisntace(func)(elm))])
Example #31
tags = data['tags']
model_state = data['model_state']

model = NeuralNet(input_size, hidden_size, output_size).to(device)
model.load_state_dict(model_state)
model.eval()

bot_name = "Hamdi"
user_name = input("Merhaba, adın ne?: ")  # Turkish: "Hello, what's your name?"

while True:
    sentence = input('{}: '.format(user_name))
    if sentence == 'quit':
        break

    sentence = tokenize(sentence)
    X = bag_of_words(sentence, all_words)
    X = X.reshape(1, X.shape[0])
    X = torch.from_numpy(X).to(device)

    output = model(X)

    _, predicted = torch.max(output, dim=1)
    tag = tags[predicted.item()]

    probs = torch.softmax(output, dim=1)
    prob = probs[0][predicted.item()]

    if prob.item() > 0.50:
        for intent in intents['intents']:
            if tag == intent['tag']:
Example #32
    import tqdm
    from nlp import load_dataset


    with open('intents.json', 'r') as f:
        intents = json.load(f)

    all_words = []
    tags = []
    match = []

    for intent in intents['intents']:
        tag = intent['tag']
        tags.append(tag)
        for pattern in intent['patterns']:
            tokenized_sentence = tokenize(pattern)
            all_words.extend(tokenized_sentence)
            match.append((tokenized_sentence, tag))

    ignored_words = ['?', '.', '!']
    all_words = [stem(w) for w in all_words if w not in ignored_words]
    all_words = sorted(set(all_words))
    tags = sorted(set(tags))

    match_1 = []
    match_2 = []

    for (pattern_sentence, tag) in match:
        bag = bag_of_words(pattern_sentence, all_words)
        match_1.append(bag)