Example #1
def display_tokens_similarity():
    """ Goal: for each token of each sentence, computes which of the tokens of
    every other sentence is closer and shows it on shell.

    Method: in config.tokens_set we have computed the similarity of every pair
    of tokens, so we just loop over all of them and keep the most similar.

    Results:  a sample of the results can be seen at results_ranking/display_tokens_similarity/, and
    results are shown on shell
    """
    for video_id in range(config.first_video, config.last_video):
        print '\n\n ***** video ' + str(video_id)
        video_captions = load_video_captions(video_id)

        # for each token of each sentence we want to know which token of every other sentence is the most similar
        for sentence1 in video_captions.sentences:
            try:
                print '\tsentence ' + str(sentence1.id) + ' ' + sentence1.sentence
            except UnicodeEncodeError:
                print '[PRINTING ERROR] with printing sentence ' + str(sentence1.id)
            for token1_id in sentence1.tokens_id_list:
                try:
                    print '\t\ttoken ' + config.tokens_set.tokens[token1_id].token
                except UnicodeEncodeError:
                    print '[PRINTING ERROR] with printing token'
                for sentence2 in video_captions.sentences:
                    most_similar_token_in_sentence = (None, float('-inf'))
                    for token2_id in sentence2.tokens_id_list:
                        if (token1_id, token2_id) in config.tokens_set.tokens_similarities_closest:
                            similarity = config.tokens_set.tokens_similarities_closest[(token1_id, token2_id)]
                            if similarity > most_similar_token_in_sentence[1]:
                                most_similar_token_in_sentence = (token2_id, similarity)
                    if most_similar_token_in_sentence[0] is not None:
                        try:
                            most_similar_token = config.tokens_set.tokens[most_similar_token_in_sentence[0]].token
                            print '\t\t\tmost similar token in sentence ' + str(sentence2.id) + ' is ' + most_similar_token + ' (' + str(most_similar_token_in_sentence[1]) + ')\t\t\t(' + sentence2.sentence + ')'
                        except UnicodeEncodeError:
                            print '\t\t\t[PRINTING ERROR] with printing most similar token in sentence ' + str(sentence2.id)
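
All of these examples rely on a load_video_captions helper that is not shown here. A minimal sketch of what it is assumed to do, based on the per-video pickle files written in Example #3 below (the exact behaviour, and any extra bookkeeping the real helper performs, are assumptions):

import pickle

import config


def load_video_captions(video_id):
    """Hypothetical sketch: unpickle the per-video VideoCaptions object
    written by create_video_captions (Example #3)."""
    path = config.pickle_folder + "/video_captions_" + str(video_id) + ".pickle"
    with open(path, "rb") as f:
        return pickle.load(f)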
Example #2
def combine_subjects_and_predicates():
    """ generates new training sentences by replacing tokens with synonyms
        "done adding new training sentences! Added 450757 captions" (without subject sense matching)
        "done adding new training sentences! Added 283313 captions" (with subject sense matching)
        "done adding new training sentences! Added 408177 captions" (with subject sense matching over the previously cleaned training sentences)
    """
    videos_new_captions = {}

    data_file = open(config.path_to_train_val_videodatainfo)
    data = json.load(data_file)

    training_sentences = {}
    for caption in data['sentences']:
        if caption['video_id'] in training_sentences:
            training_sentences[caption['video_id']].append(caption['caption'])
        else:
            training_sentences[caption['video_id']] = [caption['caption']]

    for video_id in range(config.first_video, config.last_video):
        video_captions = load_video_captions(video_id)

        parsed_captions, current_captions = parse_captions(
            video_captions, video_id, training_sentences)

        new_captions = []
        for caption in parsed_captions:
            # try to combine sentence subject with all other predicates
            subject = caption['subject']
            for caption2 in parsed_captions:
                number_matching = subject['singular'] == caption2['predicate']['singular']
                sense_matching, similarity = senses_match(caption, caption2)  # TODO lpmayos: force sense_matching to True to reproduce the first experiment

                candidate_caption = caption['subject']['text'] + ' ' + caption2['predicate']['text']
                if number_matching and sense_matching and candidate_caption not in current_captions:
                    new_captions.append(candidate_caption)

        videos_new_captions[video_id] = list(
            set(new_captions + current_captions))

    create_training_sentences(videos_new_captions,
                              config.path_to_new_train_val_videodatainfo)
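
The senses_match check used above is not defined in this snippet. A minimal, hypothetical sketch of one way it could work, comparing the first WordNet noun synsets of the two subject heads; the head-word heuristic and the threshold are assumptions, not the project's actual implementation:

from nltk.corpus import wordnet as wn


def senses_match(caption1, caption2, threshold=0.3):
    """Hypothetical: approximate each subject head by the last word of the
    subject text and require a minimum WordNet path similarity between the
    first noun synsets of the two heads."""
    head1 = caption1['subject']['text'].split()[-1]
    head2 = caption2['subject']['text'].split()[-1]
    synsets1 = wn.synsets(head1, pos=wn.NOUN)
    synsets2 = wn.synsets(head2, pos=wn.NOUN)
    if not synsets1 or not synsets2:
        return False, 0.0
    similarity = synsets1[0].path_similarity(synsets2[0]) or 0.0
    return similarity >= threshold, similarity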
Example #3
def create_video_captions():
    """ Goal: create pickle files containing video and tokens information, to
    speed up the experiments.

    Method: for each video in train_val_set try to load videoCaption object
    from pickle. If it dies not exist, create a VideoCaption object and save it
    as pickle, and add all its tokens to token_set, containing information of
    all the tokens of all sentences of all videos.
    If compute_similarities is True, it computes the similarity between all
    pairs of tokens extracted from the annotations and saves it to tokens_set.

    Results: pickle files for each video and for tokens_set are saved to
    config.pickle_folder
    """
    with open(config.path_to_train_val_videodatainfo) as data_file:
        i = 0
        data = json.load(data_file)
        for video_id in range(config.first_video, config.last_video):

            try:
                video_captions = load_video_captions(video_id)
            except (OSError, IOError):
                print '*** creating pickle of video ' + str(video_id)
                video_captions = VideoCaptions(data, 'video' + str(video_id))
                path = config.pickle_folder + "/video_captions_" + str(video_id) + ".pickle"
                pickle.dump(video_captions, open(path, "wb"))

            video_captions.compute_all_tokens_similarity()

            i += 1
            if i == 10:
                print 'saving small tokens_set_10'
                pickle.dump(config.tokens_set,
                            open('../pickle_small/tokens_set_10.pickle', "wb"))
            if i % 100 == 0:
                print 'iteration ' + str(i) + ' -----------> dumping tokens set'
                pickle.dump(config.tokens_set,
                            open(config.tokens_set_to_load, "wb"))
        print 'iteration ' + str(i) + ' -----------> dumping tokens set'
        pickle.dump(config.tokens_set, open(config.tokens_set_to_load, "wb"))
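
A minimal usage sketch (assuming the same config module): before running the similarity and ranking experiments in Examples #1 and #4, the dumped tokens_set is presumably reloaded like this.

import pickle

import config

# reload the token set dumped by create_video_captions
config.tokens_set = pickle.load(open(config.tokens_set_to_load, "rb"))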
Example #4
def rank_captions_and_remove_worst(similarity_or_distance='similarity'):
    """ experiments 1, 2, 3, 4, 4symmetrical and 5
    """

    # dict of the form {video_id: [(sentence0, similarity), ..., (sentence19, similarity)], ...},
    # where similarity indicates how similar the caption is to all other captions of the same video
    video_captions_ranking = {}

    for video_id in range(config.first_video, config.last_video):
        video_captions = load_video_captions(video_id)
        video_captions_ranking[video_id] = compute_sentences_ranking(
            video_captions)

    # log sentences ranking
    log_sentences_ranking(video_captions_ranking, similarity_or_distance)

    # generate boxplot with sentences similarities and compute th2
    th2 = generate_boxplot(video_captions_ranking, similarity_or_distance)

    # generate barchart of how many sentences we have to remove from each video according to threshold th2
    generate_barchart(video_captions_ranking, th2, similarity_or_distance)

    # generate a new training set removing the detected annotations
    remove_training_sentences(video_captions_ranking, th2,
                              similarity_or_distance)
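
compute_sentences_ranking is assumed to score each caption by how similar it is to the other captions of the same video, as the comment above describes. A hypothetical sketch using a simple token-overlap (Jaccard) proxy; the real code presumably reuses the pairwise token similarities stored in config.tokens_set instead:

import config


def compute_sentences_ranking(video_captions):
    """Hypothetical sketch: score each sentence by its mean token overlap with
    every other caption of the same video and return (sentence, score) pairs
    sorted from most to least representative."""
    ranking = []
    for sentence1 in video_captions.sentences:
        tokens1 = set(config.tokens_set.tokens[t].token for t in sentence1.tokens_id_list)
        scores = []
        for sentence2 in video_captions.sentences:
            if sentence1.id == sentence2.id:
                continue
            tokens2 = set(config.tokens_set.tokens[t].token for t in sentence2.tokens_id_list)
            union = tokens1 | tokens2
            scores.append(len(tokens1 & tokens2) / float(len(union)) if union else 0.0)
        ranking.append((sentence1, sum(scores) / len(scores) if scores else 0.0))
    return sorted(ranking, key=lambda pair: pair[1], reverse=True)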
Example #5
def association_strengthen():
    """ generates new training sentences by replacing those subjects with lower PMI with the ones with higher PMI,
    and replacing the verbs with the synset with higher PMI
    """

    synsets_dict = {}

    videos_new_captions = {}

    data_file = open(config.path_to_train_val_videodatainfo)
    data = json.load(data_file)

    training_sentences = {}
    for caption in data['sentences']:
        if caption['video_id'] in training_sentences:
            training_sentences[caption['video_id']].append(caption['caption'])
        else:
            training_sentences[caption['video_id']] = [caption['caption']]

    for video_id in range(config.first_video, config.last_video):
        print video_id

        video_captions = load_video_captions(video_id)
        parsed_captions, current_captions = parse_captions(
            video_captions, video_id, training_sentences)

        # create groups of captions whose subjects can be replaced by one of the subjects in the group (same number and meaning)
        groups = create_groups(parsed_captions)

        # for each group, find candidates and context, find candidate with higher pmi and replace candidates with higher-pmi candidate
        new_captions = []
        for i, group in enumerate(groups):

            # find candidates and context
            candidates_subject, context_predicates = find_candidates_and_contexts(
                group, parsed_captions)

            selected_candidate = select_candidate(candidates_subject,
                                                  context_predicates)
            selected_candidate_offset = min(
                int(a.features['start_string'])
                for a in parsed_captions[selected_candidate['caption_id']]['subject']['subject'])
            selected_candidate_start = selected_candidate['start'] - selected_candidate_offset
            selected_candidate_end = selected_candidate['end'] - selected_candidate_offset
            selected_candidate_text = selected_candidate['subject_text'][
                selected_candidate_start:selected_candidate_end]

            for caption_id in group:

                caption = parsed_captions[caption_id]

                # replace the subject root of captions with a lower PMI with the selected candidate
                subject_root = [a for a in caption['subject']['subject'] if a.subject_root]
                if len(subject_root) > 0 and selected_candidate_text not in caption['subject']['text']:
                    subject_root[0].form = selected_candidate_text
                subject = ' '.join([a.form for a in caption['subject']['subject']])

                # find the verb synset with the highest PMI and replace the current verb
                predicate_verbs = [
                    a for a in caption['predicate']['predicate']
                    if a.pos.startswith('VB')  # optionally also: and a.predicate_root
                ]
                for predicate_verb in predicate_verbs:
                    verb_lemma = predicate_verb.lemma

                    # cache the synsets per lemma because computing them is slow
                    if verb_lemma in synsets_dict:
                        verb_synsets_unique = synsets_dict[verb_lemma]
                    else:
                        # initialize to an empty list so the variable is always bound,
                        # even when the lemma has no verb synsets
                        verb_synsets_unique = []
                        verb_synsets = wn.synsets(verb_lemma, pos=wn.VERB)
                        if len(verb_synsets) > 0:
                            verb_synset = verb_synsets[0]
                            for lemma in verb_synset.lemma_names():
                                lemma_synsets = wn.synsets(lemma, pos=wn.VERB)
                                if len(lemma_synsets) > 0:
                                    verb_synsets_unique.append(lemma_synsets[0])
                            verb_synsets_unique = list(set(verb_synsets_unique))
                        synsets_dict[verb_lemma] = verb_synsets_unique

                    selected_verb = None
                    max_pmi = float('-inf')
                    for verb_synset in verb_synsets_unique:
                        try:
                            pmi = do_pmi([verb_synset.lemmas()[0].name() + ' VB'],
                                         caption['predicate']['context'])
                            if pmi['normalized_pmi'] > max_pmi:
                                max_pmi = pmi['normalized_pmi']
                                selected_verb = verb_synset
                        except Exception:
                            print '[ERROR] Error in do_pmi, skipping verb_synset'

                    if selected_verb and selected_verb.lemma_names()[0] != verb_lemma:

                        new_verb = None

                        # properly conjugate the selected verb and replace it in caption['predicate']['text']
                        try:
                            new_lemma = selected_verb.lemma_names()[0]
                            features = predicate_verb.features
                            if 'finiteness' in features and features['finiteness'] == 'PART':
                                new_verb = conjugate(new_lemma, tense=(pattern.en.PAST + pattern.en.PARTICIPLE))
                            elif 'finiteness' in features and features['finiteness'] == 'GER':
                                new_verb = conjugate(new_lemma, tense=pattern.text.GERUND)
                            elif 'tense' in features and features['tense'] == 'PAST':
                                new_verb = conjugate(new_lemma, tense=PAST)
                            elif 'tense' in features and features['tense'] == 'PRES':
                                if 'person' in features and features['person'] == '3':
                                    new_verb = conjugate(new_lemma, tense=PRESENT, number=SINGULAR)
                                else:
                                    new_verb = conjugate(new_lemma, tense=PRESENT, number=PLURAL)
                        except KeyError, e:
                            print '[KeyError] I got a KeyError - reason "%s"' % str(e)
                            print predicate_verb.features

                        if new_verb:
                            predicate_verb.plemma = selected_verb.lemma_names()[0]
                            predicate_verb.form = new_verb
                            predicate_verb.lemma = selected_verb.lemma_names()[0]
                            predicate_verb.columns[1] = new_verb
                            predicate_verb.columns[2] = selected_verb.lemma_names()[0]
                            predicate_verb.columns[3] = selected_verb.lemma_names()[0]

                predicate = ' '.join(
                    [a.form for a in caption['predicate']['predicate']])
                new_captions.append(subject + ' ' + predicate)

        videos_new_captions[video_id] = list(set(new_captions))
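
do_pmi is not shown in this snippet. For reference, a hypothetical sketch of the normalized PMI it is assumed to return under the 'normalized_pmi' key; the counting source and the exact definition are assumptions:

import math


def normalized_pmi(count_xy, count_x, count_y, total):
    """Hypothetical: PMI = log p(x, y) / (p(x) p(y)), normalized by
    -log p(x, y) so that the score lies in [-1, 1]."""
    p_xy = count_xy / float(total)
    p_x = count_x / float(total)
    p_y = count_y / float(total)
    if p_xy == 0.0:
        return -1.0
    if p_xy >= 1.0:
        return 1.0
    return math.log(p_xy / (p_x * p_y)) / -math.log(p_xy)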