def _read_tagged_sequences(fname):
    """Read (tag, tokens) pairs from *fname* and group them into
    per-conversation sequences.

    The conversation id is the token before the first '_' in the second
    '/'-separated field of the tag; consecutive utterances sharing that
    id form one sequence.

    Returns a list of sequences, each a list of (tag, utterance_string)
    tuples.  Returns [] when the file yields no utterances.
    """
    sequences = []
    current = []
    prev_id = None
    for tag, tokens in get_utterances_from_file(fname):
        conv_id = tag.split('/')[1].split('_')[0]
        # a change of conversation id closes the current sequence
        if prev_id is not None and conv_id != prev_id:
            sequences.append(current)
            current = []
        prev_id = conv_id
        current.append((tag, " ".join(tokens)))
    # flush the last sequence; the guard avoids appending a spurious
    # empty sequence when the file was empty (bug in the original)
    if current:
        sequences.append(current)
    return sequences

def load_data_hmm():
    """Load train and test utterance sequences for HMM training.

    Returns (train_sequences, test_sequences): each a list of
    conversations, each conversation a list of (tag, utterance) tuples.
    """
    train_sequences = _read_tagged_sequences("data/swda_utterances.train")
    test_sequences = _read_tagged_sequences("data/swda_utterances.test")
    return train_sequences, test_sequences
def load_data():
    """Load flat (non-sequential) train and test data.

    Returns (train_utt, train_Y, test_utt, test_Y) where the *_utt
    lists hold space-joined utterance strings and the *_Y lists hold
    the corresponding dialogue-act tags.
    """
    def _read_split(fname):
        # collect utterance text and tag in parallel lists
        texts, tags = [], []
        for tag, tokens in get_utterances_from_file(fname):
            texts.append(" ".join(tokens))
            tags.append(tag)
        return texts, tags

    train_utt, train_Y = _read_split("data/swda_utterances.train")
    test_utt, test_Y = _read_split("data/swda_utterances.test")
    return train_utt, train_Y, test_utt, test_Y
def train_model(train_corpus_file, output_file_name, min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=4, alpha=0.025, min_alpha = 0.001, epochs=10, w2v_intersect=None):
    """Train a gensim Doc2Vec model on the utterances in train_corpus_file.

    Each utterance becomes one TaggedDocument keyed by its full tag
    string.  Training runs for `epochs` passes with a manually decayed
    learning rate from `alpha` down to `min_alpha`, then the model (and
    the raw utterance texts) are persisted via save_all_models under
    output_file_name.

    Parameters mirror the gensim Doc2Vec constructor (min_count, window,
    size, sample, negative, workers); w2v_intersect, if given, is a path
    to a binary word2vec file used to seed the word vectors.
    """
    utterance_tag_set = []
    utterances = dict()
    for tag, utt_tokens in get_utterances_from_file(train_corpus_file):
        # one TaggedDocument per utterance; the tag doubles as the doc id
        utterance_tag_set.append(TaggedDocument(utt_tokens, [unicode(tag )]))
        # keep the raw text around so it can be saved alongside the model
        utterances[unicode(tag)] = " ".join(utt_tokens)
    print "Training size:", len(utterances)
    model = Doc2Vec(min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers)
    model.build_vocab(utterance_tag_set)
    # use pretrained word2vec model to initialize word parameters
    if w2v_intersect:
        model.intersect_word2vec_format(w2v_intersect, binary=True)
    # linear learning-rate decay, applied manually one epoch at a time
    alpha_delta = (alpha - min_alpha) / epochs
    for epoch in range(epochs):
        print "Epoch: ", epoch
        # shuffling between epochs changes document order each pass
        random.shuffle(utterance_tag_set)
        # pin both alpha and min_alpha so gensim does no internal decay
        # within this single call to train()
        model.alpha, model.min_alpha = alpha, alpha
        model.train(utterance_tag_set)
        alpha -= alpha_delta
    save_all_models(model, utterances, output_file_name)
def load_utterances(fname):
    """Group utterance token lists by dialogue-act tag.

    The utterance id (everything after the first '/') is stripped from
    each tag before grouping, so the returned defaultdict maps a bare
    tag to the list of token lists carrying that tag.
    """
    grouped = defaultdict(list)
    for raw_tag, tokens in get_utterances_from_file(fname):
        act_tag = raw_tag.split("/")[0]  # drop the trailing id portion
        grouped[act_tag].append(tokens)
    return grouped
def get_swda_labeled_utterances():
    """Load labeled SwDA utterances from data/swda_file.txt.

    Returns (X_tokens, Y_tags): space-joined utterance strings and
    their bare dialogue-act tags (utterance id stripped), in parallel
    order.
    """
    X_tokens = []
    Y_tags = []
    for raw_tag, tokens in get_utterances_from_file("data/swda_file.txt"):
        X_tokens.append(" ".join(tokens))
        # keep only the act label; everything after '/' is the id
        Y_tags.append(raw_tag.split("/")[0])
    return X_tokens, Y_tags
def grab_data(path, model):
    """Build per-dialog sequences of utterance embeddings.

    Each utterance is embedded with model.infer_vector and a speaker
    bit (A->0, B->1) is appended to the vector; vectors are grouped by
    transcript id.  The returned labels are random dummies because this
    data is used only to obtain embeddings, not for supervised
    evaluation.

    Returns (dialog_vectors, dummy_labels).
    """
    speaker_code = {"A": 0, "B": 1}
    dialogs = defaultdict(list)
    for tag, utterance in get_utterances_from_file(path):
        # tag format: <label>/<transcript>_<speaker>[...]
        parts = tag.split('/')[1].split("_")
        # NOTE(review): we infer a fresh vector here; looking up the
        # trained vector by tag might be preferable — worth confirming.
        embedded = model.infer_vector(utterance)
        with_speaker = np.append(embedded, speaker_code[parts[1]])
        dialogs[parts[0]].append(with_speaker)
    # dummy (random) labels: one per dialog
    return dialogs.values(), np.random.randint(2, size=len(dialogs))