Example 1
def sampling(conf, positives, concept, corpus_train_padded):
    logger.info('Resampling training data...')
    sampled = [
        sp_training.sample_for_individual_mention(
            pos, len(concept.names), conf.getint('sample', 'neg_count'))
        for pos, men in positives
    ]
    name_order = [men for pos, men in positives]
    tr_data = sample.Data()
    tr_data.mentions = sample.sample_format_mentions(sampled, name_order)
    tr_data.x = sample.sample_format_x(sampled, corpus_train_padded,
                                       concept.padded, tr_data.mentions)
    tr_data.y = sample.sample_format_y(sampled)
    assert len(tr_data.x[0]) == len(tr_data.y)
    return tr_data
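The sp_training.sample_for_individual_mention helper is not shown in this example; purely to illustrate the resampling idea (the signature mirrors the call above, but the body below is an assumption, not the project's implementation), a per-mention negative sampler could look like this:

import random

def sample_for_individual_mention(pos, n_concepts, neg_count):
    # `pos` is assumed to hold the concept indices that are correct for this
    # mention; every other index is a potential negative candidate.
    positives = set(pos)
    candidates = [i for i in range(n_concepts) if i not in positives]
    # draw fresh negatives on every call, so each resampling pass pairs the
    # same positives with different negatives
    negatives = random.sample(candidates, min(neg_count, len(candidates)))
    return [(i, 1) for i in sorted(positives)] + [(i, 0) for i in negatives]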
Example 2
    # format of corpus.padded: numpy, mentions, padded
    logger.info('New shape: {0}'.format(corpus.padded.shape))


# format data for cnn
try:
    with open('gitig_new_data.pickle', 'rb') as f:
        [tr_data, val_data, concept_order] = pickle.load(f)
    tr_data.y = np.array(tr_data.y)
    val_data.y = np.array(val_data.y)
    # reload the concept dict so that it is in the same order as when the prediction data was created
    concept = concept_obj(config,dictionary,order=concept_order)
    logger.info('Using saved data: {0}'.format('gitig_new_data.pickle'))

except OSError:
    tr_data = sample.Data()
    val_data = sample.Data()
    for data, corpus in zip([tr_data, val_data], [corpus_train, corpus_dev]):
        data.x = sample.no_cangen_format_x(corpus.padded, concept.padded)
        data.mentions = sample.no_cangen_format_mentions(corpus.names, len(concept.names))
        # label 1 iff the mention has exactly one id and that id belongs to the candidate concept
        data.y = [[1] if men[0] in can and len(men) == 1 else [0]
                  for men in corpus.ids for can in concept.all_ids]
        data.y = [item for sublist in data.y for item in sublist]
        assert len(data.x[0]) == len(data.y)
    
    # save the data for cnn since it takes forever to generate
    # also save the concept dict order for faster prediction
    concept_order = uniq(concept.ids)
    data = [tr_data, val_data, concept_order]
    with open('gitig_new_data.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=4)
    logger.info('Mentions and concepts saved.')
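The uniq helper used to build concept_order is not defined in this snippet; assuming it is a plain order-preserving de-duplication over hashable ids (which is what saving a reusable concept order suggests), a minimal version would be:

def uniq(seq):
    # keep the first occurrence of each id, preserving order, so the saved
    # concept_order matches the order in which the data above was built
    seen = set()
    out = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            out.append(item)
    return out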
Example 3

                    synonym_pairs.append((name, concept[j]))

        return synonym_pairs

    synonym_pairs = generate_synonym_pairs(dictionary, order=concept_order)
    questions = [question for question, answer in synonym_pairs]
    answers = [answer for question, answer in synonym_pairs]
    # FIXME: there may be positives as well
    # negatives = random.choices(concept.names,k=len(questions)) # this only works for python 3.6 +
    negatives = [random.choice(concept.names) for i in range(len(questions))]

    collection = []
    for question, positive, negative in zip(questions, answers, negatives):
        collection.extend([(question, positive, 1), (question, negative, 0)])
    random.shuffle(collection)
    tr_data = sample.Data()

    for sat, data in zip([collection], [tr_data]):
        x0 = []
        x1 = []
        y = []
        for q, a, l in sat:
            x0.append([
                vocabulary.get(tok.lower(), 1) for tok in nltk.word_tokenize(q)
            ])
            x1.append([
                vocabulary.get(tok.lower(), 1) for tok in nltk.word_tokenize(a)
            ])
            y.append(l)
        x0 = pad_sequences(np.array(x0),
                           padding='post',
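In the loop above each token is looked up in `vocabulary` with 1 as the fallback index (presumably the out-of-vocabulary id), and the resulting index lists are then padded to a common length. A self-contained sketch of that pattern on a toy vocabulary (the import path for pad_sequences may differ between Keras/TensorFlow versions):

import nltk
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nltk.download('punkt') may be needed once before word_tokenize works
vocabulary = {'erythema': 2, 'migrans': 3, 'rash': 4}  # toy vocabulary; 1 = OOV

sentences = ['Erythema migrans', 'a skin rash']
indexed = [[vocabulary.get(tok.lower(), 1) for tok in nltk.word_tokenize(s)]
           for s in sentences]
# pad with zeros at the end so every row has the same length
padded = pad_sequences(indexed, padding='post')
print(padded)  # [[2 3 0]
               #  [1 1 4]]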
Example 4
        assert not identical.all()
    except AssertionError:
        # the drawn negative is identical to the positive entry; step back
        # through the collection until a different entry is found
        identical = c == collection[j]
        while identical.all():
            j = j - 10
            identical = c == collection[j]
    if order:
        answers.append(c)
        answers.append(collection[j])
        labels.extend([1, 0])
    else:
        answers.append(collection[j])
        answers.append(c)
        labels.extend([0, 1])

tr_data = sample.Data()
tr_data.x = [np.array(questions[:cutoff * 2]), np.array(answers[:cutoff * 2])]
tr_data.y = np.array(labels[:cutoff * 2])
tr_data.mentions = []
for i, c in enumerate(collection[:cutoff]):
    tr_data.mentions.append((i * 2, i * 2 + 2, collection_names[i]))

syn_val_data = sample.Data()
syn_val_data.x = [
    np.array(questions[cutoff * 2:]),
    np.array(answers[cutoff * 2:])
]
syn_val_data.y = np.array(labels[cutoff * 2:])
syn_val_data.mentions = []
for i, c in enumerate(collection[cutoff:]):
    syn_val_data.mentions.append((i * 2, i * 2 + 2, collection_names[i]))
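Each entry in mentions is a (start, end, name) triple marking the two rows of x and y that belong to one original mention (its positive candidate followed by the negative). Purely to illustrate that layout (the helper below is hypothetical, not part of the project), the rows for a single mention could be pulled out like this:

def rows_for_mention(data, mention):
    # hypothetical helper: slice out one mention's rows using the
    # (start, end, name) offsets built above
    start, end, name = mention
    questions = data.x[0][start:end]  # the mention text, once per candidate
    answers = data.x[1][start:end]    # its positive and negative candidate
    labels = data.y[start:end]        # [1, 0] or [0, 1], depending on `order`
    return name, questions, answers, labels

name, q, a, y = rows_for_mention(tr_data, tr_data.mentions[0])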