def test_punkt(self):
    self.assertEqual(tokenize("uh-huh.  /"), "uh-huh . /")
    self.assertEqual(
        tokenize("# [ wh-, + what ] happens if you were to fail?  /"),
        "# [ wh - , + what ] happens if you were to fail ? /")
    self.assertEqual(tokenize("{f huh? } /"), "{f huh ? } /")
    self.assertEqual(tokenize("i hadn't heard that. /"),
                     "i had n't heard that . /")
    self.assertEqual(
        tokenize(
            "{f uh, } as far as my defense budget, {f uh, } they're cutting  it back now, what, twenty-five percent?  /"
        ),
        "{f uh , } as far as my defense budget , {f uh , } they 're cutting it back now , what , twenty-five percent ? /"
    )

def test_other(self):
    self.assertEqual(
        tokenize(
            "<<talking to someone off the phone>> what do you get for the tickets, if you bring stuff? /"
        ),
        "<<talking_to_someone_off_the_phone>> what do you get for the tickets , if you bring stuff ? /"
    )

def test_laughter(self):
    self.assertEqual(
        tokenize(
            "{c but } again i'd like to see something on the other <laughter> end back into education. but not in the education we have today.  /",
            laughters=False),
        "{c but } again i 'd like to see something on the other end back into education . but not in the education we have today . /"
    )
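Read together, these tests pin down what tokenize does: it splits punctuation and clitics ("hadn't" becomes "had n't"), keeps SWDA disfluency markup such as {f ... } and # [ ... + ... ] intact, rewrites <<comments>> into a single underscore-joined token, and drops <laughter> tokens when laughters=False (a disfluencies flag is exercised the same way in test_disfl further below). A minimal usage sketch; the import path is taken from Example #6 below, and the expected outputs are inferred from the tests rather than executed:

from preproc import tokenize

utt = "{f uh, } i hadn't heard that. /"
print(tokenize(utt))                      # -> "{f uh , } i had n't heard that . /"
print(tokenize(utt, disfluencies=False))  # -> "uh , i had n't heard that . /" ({f } markers stripped)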
Example #4
def extract_example(transcript):
    """Gets the parts we need from the SWDA utterance object."""
    tags, tags_ints, utts, utts_ints, utts_ints_bert, utts_ints_nl, utts_ints_bert_nl = [], [], [], [], [], [], []
    for utt in transcript.utterances:
        # Regex tokenization
        words = "[SPKR_{}] ".format(utt.caller) + tokenize(utt.text.lower())
        words_nl = remove_laughters(remove_disfluencies(words))
        utts.append(words)
        utts_ints.append(words_to_ints(words.split()))
        utts_ints_nl.append(words_to_ints(words_nl.split()))
        # BERT wordpiece tokenization
        bert_text = "[CLS] [SPKR_{}] ".format(utt.caller) + utt.text
        bert_tokens = bert_tokenizer.tokenize(bert_text)  # list of strings
        utts_ints_bert.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens))
        bert_text_nl = remove_laughters(remove_disfluencies(bert_text))
        bert_tokens_nl = bert_tokenizer.tokenize(bert_text_nl)
        utts_ints_bert_nl.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens_nl))
        # dialogue act tags
        tag = damsl_tag_cluster(utt.act_tag)
        tags.append(tag)
        tags_ints.append(tag_to_int(tag))
    return {'id': transcript.conversation_no, 'utts': utts, 'utts_ints': utts_ints,
            'utts_ints_bert': utts_ints_bert, 'tags': tags, 'tags_ints': tags_ints,
            'utts_ints_bert_nl': utts_ints_bert_nl, 'utts_ints_nl': utts_ints_nl}
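extract_example only assumes an object with .utterances (each exposing .caller, .text, and .act_tag) and a .conversation_no, which matches the Transcript objects yielded by the swda corpus reader. A hedged driver sketch: the swda import, the CorpusReader class, and the 'swda' data directory are assumptions not shown in the source, and tokenize, bert_tokenizer, and the helper functions above must already be in scope.

# Sketch only: assumes swda.py (Christopher Potts's SWDA reader) and a
# local copy of the corpus; neither is confirmed by the snippet above.
from swda import CorpusReader

corpus = CorpusReader('swda')  # directory containing swda-metadata.csv
examples = [extract_example(trans) for trans in corpus.iter_transcripts()]
print(examples[0]['id'], examples[0]['tags'][:5])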
def test_disfl(self):
    self.assertEqual(tokenize("{f huh? } /", disfluencies=False),
                     "huh ? /")
Example #6
from preproc import tokenize
from skip import word2vec
import pandas as pd

settings = {
    'window_size': 2,      # context window: +-2 words around the center word
    'n': 10,               # dimensionality of the word embeddings (also the hidden layer size)
    'epochs': 50,          # number of training epochs
    'learning_rate': 0.01  # learning rate for the weight updates
}

raw_tweet_data = pd.read_csv('twitter-dataset-avengersendgame/tweets.csv', encoding='cp1252', index_col=0)

raw_text = raw_tweet_data['text']
corpus = []
for tweet in raw_text:
    corpus.append(tokenize(tweet))

w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)
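generate_training_data is where this example stops: the script builds the skip-gram training pairs but never trains. Assuming the word2vec class in skip follows the usual from-scratch pattern and exposes a train method over that data (an assumption; the class body isn't shown here), the script would finish along these lines:

# Hypothetical final step: train() is assumed, not confirmed by the source.
w2v.train(training_data)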