def sampling(conf, positives, concept, corpus_train_padded):
    """Resample negative training examples for each positive mention.

    Args:
        conf: parsed config object; reads the ``[sample] neg_count`` int
            to decide how many negatives to draw per mention.
        positives: iterable of ``(pos, mention)`` pairs to resample from.
        concept: concept object providing ``.names`` and ``.padded``.
        corpus_train_padded: padded training corpus matrix passed to
            ``sample.sample_format_x``.

    Returns:
        A ``sample.Data`` instance with ``mentions``, ``x`` and ``y`` filled in.
    """
    logger.info('Resampling training data...')
    # Draw negatives for every positive mention individually.
    sampled = [
        sp_training.sample_for_individual_mention(
            pos, len(concept.names),
            # BUGFIX: was config.getint(...) — used a global instead of the
            # `conf` parameter, which was silently ignored.
            conf.getint('sample', 'neg_count'))
        for pos, men in positives
    ]
    name_order = [men for pos, men in positives]

    tr_data = sample.Data()
    tr_data.mentions = sample.sample_format_mentions(sampled, name_order)
    # BUGFIX: was corpus_train.padded (a global) — the corpus_train_padded
    # parameter was never used; honor the argument the caller passed in.
    tr_data.x = sample.sample_format_x(sampled, corpus_train_padded,
                                       concept.padded, tr_data.mentions)
    tr_data.y = sample.sample_format_y(sampled)
    assert len(tr_data.x[0]) == len(tr_data.y)
    return tr_data
# format of corpus.padded: numpy, mentions, padded
logger.info('New shape: {0}'.format(corpus.padded.shape))

# Format data for the CNN. Generating the mention/concept pairs is very
# slow, so try to load a previously cached pickle first and only rebuild
# (and re-cache) when the file is missing/unreadable.
try:
    # BUGFIX: was pickle.load(open(...)) — the file handle was never
    # closed; use a context manager instead.
    with open('gitig_new_data.pickle', 'rb') as f:
        [tr_data, val_data, concept_order] = pickle.load(f)
    tr_data.y = np.array(tr_data.y)
    val_data.y = np.array(val_data.y)
    # reload the concept dict so that it is in the order used when the
    # data for prediction was created
    concept = concept_obj(config, dictionary, order=concept_order)
    logger.info('Using saved data: {0}'.format('gitig_new_data.pickle'))
except OSError:
    # No cached pickle: build training/validation data from scratch.
    tr_data = sample.Data()
    val_data = sample.Data()
    for data, corpus in zip([tr_data, val_data], [corpus_train, corpus_dev]):
        data.x = sample.no_cangen_format_x(corpus.padded, concept.padded)
        data.mentions = sample.no_cangen_format_mentions(corpus.names,
                                                         len(concept.names))
        # Label 1 only when the mention is unambiguous (single id) and its
        # id occurs in the candidate; 0 otherwise. Then flatten the nested
        # [[0]/[1], ...] lists into one flat label vector.
        data.y = [[1] if men[0] in can and len(men) == 1 else [0]
                  for men in corpus.ids for can in concept.all_ids]
        data.y = [item for sublist in data.y for item in sublist]
        assert len(data.x[0]) == len(data.y)

    # save the data for cnn since it takes forever to generate
    # also save the concept dict order for faster prediction
    concept_order = uniq(concept.ids)
    data = [tr_data, val_data, concept_order]
    with open('gitig_new_data.pickle', 'wb') as f:
        pickle.dump(data, f, protocol=4)
    logger.info('Mentions and concepts saved.')
# NOTE(review): flattened/whitespace-mangled chunk. It begins with the TAIL of
# generate_synonym_pairs (its `def` line is outside this view) and ends
# mid-statement — the trailing `x0 = pad_sequences(np.array(x0), padding='post',`
# call is truncated — so the code is left byte-identical rather than restructured.
# What the visible part does: builds (question, answer) synonym pairs from the
# dictionary, draws one random negative concept name per question (the FIXME
# notes the negatives may accidentally contain positives), interleaves
# (q, positive, 1)/(q, negative, 0) triples, shuffles them, then tokenizes each
# side with nltk and maps tokens through `vocabulary` (OOV tokens map to id 1 —
# presumably an <UNK> index; confirm against the vocabulary builder).
synonym_pairs.append((name, concept[j])) return synonym_pairs synonym_pairs = generate_synonym_pairs(dictionary, order=concept_order) questions = [question for question, answer in synonym_pairs] answers = [answer for question, answer in synonym_pairs] # FIXME: there may be positives as well # negatives = random.choices(concept.names,k=len(questions)) # this only works for python 3.6 + negatives = [random.choice(concept.names) for i in range(len(questions))] collection = [] for question, positive, negative in zip(questions, answers, negatives): collection.extend([(question, positive, 1), (question, negative, 0)]) random.shuffle(collection) tr_data = sample.Data() for sat, data in zip([collection], [tr_data]): x0 = [] x1 = [] y = [] for q, a, l in sat: x0.append([ vocabulary.get(tok.lower(), 1) for tok in nltk.word_tokenize(q) ]) x1.append([ vocabulary.get(tok.lower(), 1) for tok in nltk.word_tokenize(a) ]) y.append(l) x0 = pad_sequences(np.array(x0), padding='post',
# NOTE(review): flattened/whitespace-mangled chunk that starts MID-try-block —
# the opening `try:` and the enclosing loop that defines `c`, `j`, `collection`,
# `questions`, `answers`, `labels`, `cutoff` are outside this view — so the code
# is left byte-identical rather than restructured.
# What the visible part does: when a randomly picked negative `collection[j]`
# turns out identical to the current item `c`, it steps j back by 10 until a
# non-identical entry is found (assumes such an entry exists — TODO confirm j
# cannot underflow); `order` decides whether the (positive, negative) pair is
# appended as (c, collection[j]) with labels [1, 0] or reversed with [0, 1].
# It then splits questions/answers/labels at 2*cutoff into tr_data and
# syn_val_data (two rows per collection item, hence the *2), recording
# (start, end, name) mention spans for each.
assert not identical.all() except AssertionError: identical = c == collection[j] while identical.all(): j = j - 10 identical = c == collection[j] if order: answers.append(c) answers.append(collection[j]) labels.extend([1, 0]) else: answers.append(collection[j]) answers.append(c) labels.extend([0, 1]) tr_data = sample.Data() tr_data.x = [np.array(questions[:cutoff * 2]), np.array(answers[:cutoff * 2])] tr_data.y = np.array(labels[:cutoff * 2]) tr_data.mentions = [] for i, c in enumerate(collection[:cutoff]): tr_data.mentions.append((i * 2, i * 2 + 2, collection_names[i])) syn_val_data = sample.Data() syn_val_data.x = [ np.array(questions[cutoff * 2:]), np.array(answers[cutoff * 2:]) ] syn_val_data.y = np.array(labels[cutoff * 2:]) syn_val_data.mentions = [] for i, c in enumerate(collection[cutoff:]): syn_val_data.mentions.append((i * 2, i * 2 + 2, collection_names[i]))