def trees(filename):
    # Parse a PTB-format file and yield each tree after transformation.
    # `ptb.parse` and `transform` are this project's own helpers.
    with open(filename) as f:
        for t in ptb.parse(f):
            yield transform(t)
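# Usage sketch (an assumption, not part of the original code): `trees`
# relies on the project's `ptb.parse` and `transform` helpers, and the path
# below simply mirrors the data directory referenced further down.
#
#     for t in trees("data/stanfordSentimentTreebank/trees/dev.txt"):
#         print t            # one transformed parse tree per sentence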
import re
import codecs
import pickle
from collections import Counter, OrderedDict

import numpy as np


def process_stanford_sentiment_corpus(train_path, dev_path, test_path,
                                       pkl_path, unk_threshold,
                                       unk_token='<UNK>',
                                       pad_token='<PADDING>'):
    """
    Take the paths to the PTB tree files of the train/dev/test splits.

    unk_threshold: words whose training-set frequency is at or below this
    threshold are replaced by unk_token.

    Preprocess the data, dump it as a pickle to pkl_path and return the
    pickled data tuple.
    """
    # parse all the trees
    # and represent each sentence as a list of words
    print "parsing trees.."
    with codecs.open(train_path, "r", encoding="utf8") as train_f, \
         codecs.open(dev_path, "r", encoding="utf8") as dev_f, \
         codecs.open(test_path, "r", encoding="utf8") as test_f:
        # flattened subtrees for the training data only
        train_sents, train_labels = zip(*[sub_sent
                                          for l in train_f
                                          for sub_sent in flattened_subtrees(parse(l))])
        dev_sents, dev_labels = zip(*[flatten_tree(parse(l)) for l in dev_f])
        test_sents, test_labels = zip(*[flatten_tree(parse(l)) for l in test_f])

    print "Train sent size: %d\nDev sent size: %d\nTest sent size: %d" % (
        len(train_sents), len(dev_sents), len(test_sents))

    # gather sentence length statistics
    sent_lens = [len(sent) for sent in train_sents]
    train_sent_max_len = max(sent_lens)
    print "train_sent_max_len: %d" % (train_sent_max_len)
    print "sent_mean_len: %f" % (np.mean(sent_lens))
    print "sent_median_len: %f" % (np.median(sent_lens))

    dev_sent_max_len = max(len(sent) for sent in dev_sents)
    print "dev_sent_max_len: %d" % (dev_sent_max_len)
    test_sent_max_len = max(len(sent) for sent in test_sents)
    print "test_sent_max_len: %d" % (test_sent_max_len)

    # lowercase and replace every digit with DIGIT
    # (somewhat memory inefficient)
    print "convert digits..."
    regexp = re.compile(r'\d')
    train_sents = [regexp.sub('DIGIT', ' '.join(sent).lower()).split()
                   for sent in train_sents]
    dev_sents = [regexp.sub('DIGIT', ' '.join(sent).lower()).split()
                 for sent in dev_sents]
    test_sents = [regexp.sub('DIGIT', ' '.join(sent).lower()).split()
                  for sent in test_sents]

    print "Collecting word frequency"
    # count the frequency of every word in the training set
    word_freq = Counter(w for sent in train_sents for w in sent)

    print "Building word and index mapping"
    # build the word-to-index dictionary and vice versa;
    # infrequent words are mapped to unk_token
    frequent_words = [w for w in word_freq if word_freq[w] > unk_threshold]
    print "Vocab size: %d" % len(frequent_words)

    # add the two special tokens
    frequent_words.append(unk_token)
    frequent_words.append(pad_token)

    word2index = OrderedDict([(w, i) for i, w in enumerate(frequent_words)])
    index2word = OrderedDict([(i, w) for w, i in word2index.items()])

    padding_index = word2index[pad_token]
    print "padding_index = %d" % (padding_index)

    print "Converting sentence to numpy array.."
    # map a sentence to word indices and pad it to max_len
    sent2array_padded = lambda sent, max_len: (
        [word2index.get(word, word2index[unk_token]) for word in sent]
        + [padding_index] * (max_len - len(sent))  # add the paddings
    )
    # sent2array_unpadded = lambda sent: [word2index.get(word, word2index[unk_token])
    #                                     for word in sent]

    # construct the sentence data:
    # each sentence is represented by its word indices
    def create_dataset(sents, labels, sent_max_len):
        x = np.array([sent2array_padded(sent, sent_max_len) for sent in sents],
                     dtype="int32")
        y = np.array(labels, dtype="int32")
        return x, y

    train_x, train_y = create_dataset(train_sents, train_labels, train_sent_max_len)
    dev_x, dev_y = create_dataset(dev_sents, dev_labels, dev_sent_max_len)
    test_x, test_y = create_dataset(test_sents, test_labels, test_sent_max_len)

    # load the pretrained embedding and bundle everything for pickling
    pkl_data = ((train_x, train_y),
                (dev_x, dev_y),
                (test_x, test_y),
                word2index,
                index2word,
                np.load("data/stanfordSentimentTreebank/trees/pretrained.npy"))

    print "dumping pickle to %s" % (pkl_path)
    with open(pkl_path, 'wb') as pkl_f:
        pickle.dump(pkl_data, pkl_f)

    debug = True
    if debug:
        print type(train_x)
        print train_x[0]
        print train_y[0]
        print dev_x[0]
        print dev_y[0]
        print test_x[0]
        print test_y[0]

    return pkl_data
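# Hypothetical driver (an assumption, not in the original code): the file
# names below mirror the data directory already referenced above, and the
# unpacking follows the pkl_data tuple built in
# process_stanford_sentiment_corpus.
def preprocess_all(data_dir="data/stanfordSentimentTreebank/trees",
                   unk_threshold=1):
    pkl_data = process_stanford_sentiment_corpus(
        data_dir + "/train.txt",
        data_dir + "/dev.txt",
        data_dir + "/test.txt",
        data_dir + "/processed.pkl",
        unk_threshold)
    word2index = pkl_data[3]
    print "vocabulary size (incl. special tokens): %d" % len(word2index)
    return pkl_data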
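# The span checks below use an `expectequal` helper that is not shown here.
# A minimal sketch, assuming it is just an equality assertion with a readable
# failure message (it would need to be defined before the checks run):
def expectequal(expected, actual):
    assert expected == actual, "expected %r, got %r" % (expected, actual)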
    ('PRP', 0, 1), ('ADVP', 1, 2), ('RB', 1, 2),
    ('VP', 2, 6), ('VBZ', 2, 3), ('NP-PRD', 3, 6),
    ('DT', 3, 4), ('NN', 4, 5), ('NN', 5, 6),
    (',', 6, 7), ('NP-SBJ', 7, 8), ('NNS', 7, 8),
    ('VP', 8, 9), ('VBP', 8, 9), ('SBAR', 9, 9),
    ('S', 9, 9), ('-NONE-', 9, 9), ('-NONE-', 9, 9),
    ('.', 9, 10),
]

# parse the original and the transformed tree
t = next(ptb.parse(tree))
u = next(ptb.parse(transformed_tree))

# the raw tree should yield exactly the expected spans
expectequal(set(spans), set(ptb.all_spans(t)))

# after dropping empty elements, simplifying labels and adding a root,
# the original tree should span-match the transformed tree
ptb.remove_empty_elements(t)
ptb.simplify_labels(t)
t = ptb.add_root(t)
expectequal(set(ptb.all_spans(t)), set(ptb.all_spans(u)))