コード例 #1
0
def createDelexData():
    """Main function of the script - loads delexical dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data
    """

    # create dictionary of delexicalied values that then we will search against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    fin1 = open('data/woz2/data.json')
    data = json.load(fin1)

    for dialogue_name in tqdm(data):
        if 'WOZ' not in dialogue_name:
            continue
        dialogue = data[dialogue_name]
        #print dialogue_name

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # changes to numbers only here
            digitpat = re.compile('\d+')
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)

                #print pointer_vector
                dialogue['log'][idx -
                                1]['db_pointer'] = pointer_vector.tolist()

        delex_data[dialogue_name] = dialogue

    with open('data/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data
コード例 #2
0
def createDelexData(sent, sent_act, bs, dic, turn, option):
    # normalization, split and delexicalization of the sentence
    sent = normalize(sent)
    words = sent.split()
    sent = delexicalize.delexicalise(' '.join(words), dic)
    # parsing reference number GIVEN belief state
    sent = delexicaliseReferenceNumber(sent, turn)
    # changes to numbers only here
    digitpat = re.compile('\d+')
    sent = re.sub(digitpat, '[value_count]', sent)
    if option == 'user':
        sent = fixDelex(sent, None, bs)
    if option == 'sys':
        sent = fixDelex(sent, sent_act, None)

    return sent.strip()
コード例 #3
0
def createDelexData():
    """Main function of the script - loads delexical dictionary,
    goes through each dialogue and does:
    1) data normalization
    2) delexicalization
    3) addition of database pointer
    4) saves the delexicalized data
    """
    # download the data
    loadData()

    # create dictionary of delexicalied values that then we will search against, order matters here!
    dic = delexicalize.prepareSlotValuesIndependent()
    delex_data = {}

    with open('data/multi-woz/data.json') as fin1:
        data = json.load(fin1)

    with open('data/multi-woz/dialogue_acts.json') as fin2:
        data2 = json.load(fin2)

    cnt = 10

    for dialogue_name in tqdm(data):
        dialogue = data[dialogue_name]
        # print(dialogue_name)

        idx_acts = 1

        for idx, turn in enumerate(dialogue['log']):
            # normalization, split and delexicalization of the sentence
            sent = normalize(turn['text'])

            words = sent.split()
            sent = delexicalize.delexicalise(' '.join(words), dic)

            # parsing reference number GIVEN belief state
            sent = delexicaliseReferenceNumber(sent, turn)

            # changes to numbers only here
            digitpat = re.compile('\d+')
            sent = re.sub(digitpat, '[value_count]', sent)

            # delexicalized sentence added to the dialogue
            dialogue['log'][idx]['text'] = sent

            if idx % 2 == 1:  # if it's a system turn
                # add database pointer
                pointer_vector = addDBPointer(turn)
                # add booking pointer
                pointer_vector = addBookingPointer(dialogue, turn,
                                                   pointer_vector)

                # print(pointer_vector)
                dialogue['log'][idx -
                                1]['db_pointer'] = pointer_vector.tolist()

            # FIXING delexicalization:
            dialogue = fixDelex(dialogue_name, dialogue, data2, idx, idx_acts)
            idx_acts += 1

        delex_data[dialogue_name] = dialogue

    with open('data/multi-woz/delex.json', 'w') as outfile:
        json.dump(delex_data, outfile)

    return delex_data