def main():
    _initWordDic()

    # parse the data using dataParser
    parser = DataParser()
    docs, summary = parser.parseFile()
    p_doc = Preparer(docs)
    p_summary = Preparer(summary, is_summary=True)
    p_doc.cutDocs()
    p_summary.cutDocs()
    docLens = p_doc.countDocs()
    sumLens = p_summary.countDocs()
    print(max(sumLens))
    # sys.exit()
    p_doc.doc2Int()
    p_summary.doc2Int()

    # docs, docLens, summary, sumLens are the data
    data = list(zip(docs, summary, docLens, sumLens))
    training_data = data[:1585]
    validation_data = data[1585:1835]  # fix: was data[:1835], which overlapped the training split
    testing_data = data[1835:]

    '''
    FIXING THE DIMENSION ISSUES OF BATCHES
    sf_train = SF(training_data, CONFIG.BATCH_SIZE, is_training=True)
    sf_valid = SF(validation_data, CONFIG.BATCH_SIZE, is_training=False)
    for tup in sf_train.get_batch():
        _, doc, summary, docLens, sumLens = tup
        doc_batch = _get_doc_batch(doc)
        summary_batch = _get_summary_batch(summary)
        label_batch = _get_label_batch(summary)
        docLens = np.array(docLens)
        summaryLens = np.array(sumLens)
        print(doc_batch[0])
        print(summary_batch[0])
        print(label_batch[0])
        print(list(doc for doc in docLens))
        print(list(doc for doc in summaryLens))
        sys.exit()
    '''

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-1, 1)
        # one shared set of 'Model' variables, reused across train/valid/test
        with tf.name_scope('Train'):
            with tf.variable_scope('Model', reuse=None, initializer=initializer):
                m = SummaryModel(is_training=True)
        with tf.name_scope('Valid'):
            with tf.variable_scope('Model', reuse=True, initializer=initializer):
                m_valid = SummaryModel(is_training=False)
        with tf.name_scope('Test'):
            with tf.variable_scope('Model', reuse=True, initializer=initializer):
                m_test = SummaryModel(is_training=False)

        init_op = tf.global_variables_initializer()
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = '7'
        sess = tf.Session(config=config)
        sess.run(init_op)

        for epoch in range(CONFIG.EPOCH):
            print('---------------running ' + str(epoch) + 'th epoch ----------------')
            run_epoch(sess, m, m_valid, training_data, validation_data)
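
# A minimal sketch of what the _get_doc_batch helper used in the commented-out
# debugging block above might look like. This is an assumption for illustration,
# not the project's implementation: the pad id (0), the truncation policy, and
# the function name are all hypothetical.
import numpy as np

def _get_doc_batch_sketch(docs, max_len, pad_id=0):
    """Pad integer-encoded documents to a fixed length.

    docs: list of lists of word ids, one inner list per document.
    Returns an int32 array of shape (len(docs), max_len).
    """
    batch = np.full((len(docs), max_len), pad_id, dtype=np.int32)
    for i, doc in enumerate(docs):
        trimmed = doc[:max_len]            # truncate over-long documents
        batch[i, :len(trimmed)] = trimmed  # left-align, pad the rest
    return batch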
# split each block at an optional short "<prefix>:" header, then into sentences
temp = []
for block in input_text:
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', block)
    temp.append([sent for sent in m.groupdict()['postcolon'].split('.') if sent])
input_text = temp

# lowercase each sentence, strip non-alphanumeric characters, and tokenize
for i in range(len(input_text)):
    for j in range(len(input_text[i])):
        tokens = re.sub(r"[^a-z0-9]+", " ", input_text[i][j].lower()).split()
        input_text[i][j] = tokens

# doc: input_text[i], which is a list of sentences (each a list of words)
from prepare_sentences import Preparer

P = Preparer(DIM_RNN, input_text)
docLengths = P.cutDocs()  # per-document lists of unpadded sentence lengths

# add the label and length information to the data:
# prepare a label for each sentence of each doc
sentence_labels = []
for i in range(len(labels)):
    # for each document, assign its label to all of its sentences
    num_sen = len(input_text[i])
    sentence_labels.append([labels[i]] * num_sen)

# IMPORTANT - format of data:
# data[i][0]: document (list of fixed-size sentences (lists of words))
# data[i][1]: list of labels, one per sentence of the document
# data[i][2]: list of unpadded sentence lengths of the document
text_labels = [[input_text[i], sentence_labels[i], docLengths[i]]
               for i in range(len(input_text))]
training_data = text_labels[:1585]
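
# Hedged illustration (not produced by the code above): one text_labels entry
# for a toy document, assuming DIM_RNN = 4 and a '<pad>' fill token - both
# assumptions, since Preparer's padding scheme is not shown here.
example_doc     = [['the', 'cat', 'sat', '<pad>'], ['it', 'slept', '<pad>', '<pad>']]
example_labels  = [1, 1]  # the document label repeated for each sentence
example_lengths = [3, 2]  # unpadded token count of each sentence
example_entry   = [example_doc, example_labels, example_lengths]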
# split each block at an optional short "<prefix>:" header, then into sentences
temp = []
for block in input_text:
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', block)
    temp.append([sent for sent in m.groupdict()['postcolon'].split('.') if sent])
input_text = temp

# lowercase each sentence, strip non-alphanumeric characters, and tokenize
for i in range(len(input_text)):
    for j in range(len(input_text[i])):
        tokens = re.sub(r"[^a-z0-9]+", " ", input_text[i][j].lower()).split()
        input_text[i][j] = tokens

from prepare_sentences import Preparer

P = Preparer(DIM_RNN, input_text)
P.addStartStopWords(STARTWORD, STOPWORD)
docLengths = P.cutDocs()  # per-document lists of unpadded sentence lengths
WORDCOUNTS = P.getTotalWordCount()
print(WORDCOUNTS)

# IMPORTANT - format of data:
# data[i][0]: document (list of fixed-size sentences (lists of words))
# data[i][1]: list of unpadded sentence lengths of the document
text_labels = [[input_text[i], docLengths[i]] for i in range(len(input_text))]
training_data = text_labels[:1585]
validation_data = text_labels[1585:1835]
testing_data = text_labels[1835:]

from gensim.models import Word2Vec
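
# A hedged sketch of how the Word2Vec import above is presumably used: training
# embeddings on the flattened tokenized sentences. The hyperparameters are
# assumptions, and the keyword is `vector_size` in gensim >= 4 (`size` in the
# older gensim 3.x releases).
all_sentences = [sent for doc in input_text for sent in doc]
w2v = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=1)
# e.g. w2v.wv[STARTWORD] would then give the embedding of the start token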