def test_punkt(self):
    self.assertEqual(tokenize("uh-huh. /"), "uh-huh . /")
    self.assertEqual(
        tokenize("# [ wh-, + what ] happens if you were to fail? /"),
        "# [ wh - , + what ] happens if you were to fail ? /")
    self.assertEqual(tokenize("{f huh? } /"), "{f huh ? } /")
    self.assertEqual(tokenize("i hadn't heard that. /"),
                     "i had n't heard that . /")
    self.assertEqual(
        tokenize(
            "{f uh, } as far as my defense budget, {f uh, } they're cutting it back now, what, twenty-five percent? /"
        ),
        "{f uh , } as far as my defense budget , {f uh , } they 're cutting it back now , what , twenty-five percent ? /"
    )
def test_other(self):
    self.assertEqual(
        tokenize(
            "<<talking to someone off the phone>> what do you get for the tickets, if you bring stuff? /"
        ),
        "<<talking_to_someone_off_the_phone>> what do you get for the tickets , if you bring stuff ? /"
    )
def test_laughter(self):
    self.assertEqual(
        tokenize(
            "{c but } again i'd like to see something on the other <laughter> end back into education. but not in the education we have today. /",
            laughters=False),
        "{c but } again i 'd like to see something on the other end back into education . but not in the education we have today . /"
    )
def extract_example(transcript):
    """Gets the parts we need from each utterance in an SWDA transcript."""
    tags, tags_ints, utts, utts_ints, utts_ints_bert, utts_ints_nl, utts_ints_bert_nl = [], [], [], [], [], [], []
    for utt in transcript.utterances:
        # Regex tokenization
        words = "[SPKR_{}] ".format(utt.caller) + tokenize(utt.text.lower())
        words_nl = remove_laughters(remove_disfluencies(words))
        utts.append(words)
        utts_ints.append(words_to_ints(words.split()))
        utts_ints_nl.append(words_to_ints(words_nl.split()))
        # BERT wordpiece tokenization
        bert_text = "[CLS] [SPKR_{}] ".format(utt.caller) + utt.text
        bert_tokens = bert_tokenizer.tokenize(bert_text)  # list of strings
        utts_ints_bert.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens))
        bert_text_nl = remove_laughters(remove_disfluencies(bert_text))
        bert_tokens_nl = bert_tokenizer.tokenize(bert_text_nl)
        utts_ints_bert_nl.append(bert_tokenizer.convert_tokens_to_ids(bert_tokens_nl))
        # Dialogue act tags
        tag = damsl_tag_cluster(utt.act_tag)
        tags.append(tag)
        tags_ints.append(tag_to_int(tag))
    return {'id': transcript.conversation_no,
            'utts': utts,
            'utts_ints': utts_ints,
            'utts_ints_bert': utts_ints_bert,
            'tags': tags,
            'tags_ints': tags_ints,
            'utts_ints_bert_nl': utts_ints_bert_nl,
            'utts_ints_nl': utts_ints_nl}
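# Usage sketch (an assumption, not from this file): `extract_example` expects
# Transcript objects like those yielded by Christopher Potts' swda.py corpus
# reader. Assuming the Switchboard CSVs live under ./swda, building the
# dataset would look roughly like this:
#
#     from swda import CorpusReader
#
#     examples = [extract_example(t)
#                 for t in CorpusReader('swda').iter_transcripts(display_progress=False)]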
def test_disfl(self):
    self.assertEqual(tokenize("{f huh? } /", disfluencies=False), "huh ? /")
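# The test methods above take `self` and call `self.assertEqual`, so they
# presumably live inside a unittest.TestCase subclass. A minimal harness
# sketch (the class name here is an assumption):
#
#     import unittest
#
#     class TestTokenize(unittest.TestCase):
#         ...  # test_punkt, test_other, test_laughter, test_disfl go here
#
#     if __name__ == '__main__':
#         unittest.main()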
from preproc import tokenize
from skip import word2vec

import pandas as pd

settings = {
    'window_size': 2,      # context window of +- 2 words around the center word
    'n': 10,               # dimension of the word embeddings; also the hidden layer size
    'epochs': 50,          # number of training epochs
    'learning_rate': 0.01  # learning rate
}

raw_tweet_data = pd.read_csv('twitter-dataset-avengersendgame/tweets.csv',
                             encoding='cp1252', index_col=0)
raw_text = raw_tweet_data['text']

corpus = []
for tweet in raw_text:
    corpus.append(tokenize(tweet))

w2v = word2vec()
training_data = w2v.generate_training_data(settings, corpus)
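# Next-step sketch (assumption: `skip.word2vec` follows the common
# from-scratch skip-gram pattern and exposes methods for training on the
# (target, context) pairs built above and for looking up a learned vector;
# the method names below are hypothetical, not confirmed by skip.py):
#
#     w2v.train(training_data)            # run gradient descent for settings['epochs'] epochs
#     vector = w2v.word_vec('avengers')   # look up a learned 10-dimensional embedding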