Example #1
def load_data_hmm():
    def read_sequences(path):
        # group utterances into one sequence per transcript; the transcript
        # id is the part of the tag between the '/' and the first '_'
        sequences = []
        curr_sequence = []
        cur_id = None
        for tag, tokens in get_utterances_from_file(path):
            transcript_id = tag.split('/')[1].split('_')[0]
            if cur_id is not None and cur_id != transcript_id:
                sequences.append(curr_sequence)
                curr_sequence = []
            cur_id = transcript_id
            curr_sequence.append((tag, " ".join(tokens)))
        if curr_sequence:  # flush the last transcript
            sequences.append(curr_sequence)
        return sequences

    train_sequences = read_sequences("data/swda_utterances.train")
    test_sequences = read_sequences("data/swda_utterances.test")
    return train_sequences, test_sequences
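A minimal usage sketch (the Counter-based transition count below is illustrative, not part of the original code): the per-transcript sequences are exactly what an HMM tagger needs to estimate tag-to-tag transition statistics.

from collections import Counter

train_sequences, test_sequences = load_data_hmm()
transitions = Counter()
for sequence in train_sequences:
    tags = [tag.split('/')[0] for tag, utt in sequence]  # strip the id suffix
    transitions.update(zip(tags, tags[1:]))
print(transitions.most_common(5))  # most frequent dialogue-act bigrams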
Example #2
def load_data():
    # build parallel lists of utterance strings and their dialogue-act tags
    train_utt = []
    train_Y = []
    test_utt = []
    test_Y = []
    for tag, tokens in get_utterances_from_file("data/swda_utterances.train"):
        train_utt.append(" ".join(tokens))
        train_Y.append(tag)
    for tag, tokens in get_utterances_from_file("data/swda_utterances.test"):
        test_utt.append(" ".join(tokens))
        test_Y.append(tag)

    return train_utt, train_Y, test_utt, test_Y
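Because the function returns parallel utterance/tag lists, it plugs straight into a text classifier. A sketch assuming scikit-learn is installed; CountVectorizer and LogisticRegression are illustrative choices, not taken from the original code:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

train_utt, train_Y, test_utt, test_Y = load_data()
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_utt)
X_test = vectorizer.transform(test_utt)
clf = LogisticRegression().fit(X_train, train_Y)
print(clf.score(X_test, test_Y))  # tagging accuracy on the test split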
Example #3
import random

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_model(train_corpus_file, output_file_name, min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=4, alpha=0.025,
                min_alpha=0.001, epochs=10, w2v_intersect=None):
    utterance_tag_set = []
    utterances = dict()
    for tag, utt_tokens in get_utterances_from_file(train_corpus_file):
        utterance_tag_set.append(TaggedDocument(utt_tokens, [unicode(tag)]))
        utterances[unicode(tag)] = " ".join(utt_tokens)

    print "Training size:", len(utterances)

    model = Doc2Vec(min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers)
    model.build_vocab(utterance_tag_set)

    # use a pretrained word2vec model to initialize the word vectors
    if w2v_intersect:
        model.intersect_word2vec_format(w2v_intersect, binary=True)

    # decay the learning rate linearly from alpha down to min_alpha,
    # shuffling the training documents before every epoch
    alpha_delta = (alpha - min_alpha) / epochs
    for epoch in range(epochs):
        print "Epoch:", epoch
        random.shuffle(utterance_tag_set)
        model.alpha, model.min_alpha = alpha, alpha
        model.train(utterance_tag_set)  # old (pre-1.0) gensim API: corpus only
        alpha -= alpha_delta

    save_all_models(model, utterances, output_file_name)
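A hypothetical call (both output and intersect paths are placeholders, not from the original code): a binary word2vec file such as the GoogleNews vectors is what the w2v_intersect argument expects.

train_model("data/swda_utterances.train", "models/swda_doc2vec",
            epochs=10, w2v_intersect="models/GoogleNews-vectors-negative300.bin")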
Example #4
from collections import defaultdict

def load_utterances(fname):
    # map each dialogue-act tag to the list of utterances labeled with it
    utterances = defaultdict(list)

    for tag, tokens in get_utterances_from_file(fname):
        # remove id from tag
        tag = tag.split("/")[0]
        utterances[tag].append(tokens)

    return utterances
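A quick usage sketch (the file path is reused from the earlier examples): print how many utterances carry each tag, most frequent first.

utterances = load_utterances("data/swda_utterances.train")
for tag, utts in sorted(utterances.items(), key=lambda kv: -len(kv[1])):
    print("%s\t%d" % (tag, len(utts)))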
Example #5
def get_swda_labeled_utterances():
    X_tokens = []
    Y_tags = []
    for tag, tokens in get_utterances_from_file("data/swda_file.txt"):
        X_tokens.append(" ".join(tokens))
        # remove id from tag
        tag = tag.split("/")[0]
        Y_tags.append(tag)

    return X_tokens, Y_tags
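The same loader viewed in aggregate: a short sketch of the label distribution (the Counter usage is illustrative, not part of the original code).

from collections import Counter

X_tokens, Y_tags = get_swda_labeled_utterances()
print(len(X_tokens))                    # number of labeled utterances
print(Counter(Y_tags).most_common(10))  # most frequent dialogue-act tags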
Example #6
import numpy as np
from collections import defaultdict

def grab_data(path, model):
    dialogs = defaultdict(list)
    speaker_dict = {"A": 0, "B": 1}
    for tag, utterance in get_utterances_from_file(path):
        # the id part of the tag is "transcript_speaker"; split it apart
        transcript_id, speaker = tag.split('/')[1].split("_")
        # we might want to look up the vector instead of inferring it?
        utterance_representation = np.append(model.infer_vector(utterance), speaker_dict[speaker])
        dialogs[transcript_id].append(utterance_representation)

    # we are using dummy (random) tags, as we train just to get embeddings
    # (is this ok? maybe we need to reassign every time?)
    return dialogs.values(), np.random.randint(2, size=len(dialogs))
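A hypothetical invocation (the model path is a placeholder): load a Doc2Vec model such as the one trained in Example #3 and turn each dialog into a sequence of utterance vectors.

from gensim.models.doc2vec import Doc2Vec

model = Doc2Vec.load("models/swda_doc2vec")
dialogs, dummy_tags = grab_data("data/swda_utterances.train", model)
print(len(dialogs))  # number of dialogs, one vector sequence each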