Example #1
def trees(filename):
    with open(filename) as f:
        for t in ptb.parse(f):
            yield transform(t)
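
A minimal usage sketch for the generator above; the file path is a placeholder, and ptb / transform are whatever the snippet's own module already provides:

# Hypothetical usage; "corpus/train.mrg" is a placeholder path.
# trees() yields transformed parse trees lazily, one per tree in the file.
all_trees = list(trees("corpus/train.mrg"))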
Example #2
def process_stanford_sentiment_corpus(train_path, dev_path, test_path,
                                      pkl_path,
                                      unk_threshold,
                                      unk_token='<UNK>',
                                      pad_token='<PADDING>'):
    """
    Input three paths for the PTB tree file of train/validate/test data

    unk_threshold: the frequency threshold below which the word is marked as unk_token

    preproces the data and save the pickle
    
    Return the pickle path
    """
    # parse all the trees
    # and represent each sentence as a list of words

    print "parsing trees.."
    # codecs.open decodes the files as UTF-8; the builtin open()'s third
    # positional argument is a buffer size, not an encoding
    with codecs.open(train_path, "r", "utf8") as train_f, \
         codecs.open(dev_path, "r", "utf8") as dev_f, \
         codecs.open(test_path, "r", "utf8") as test_f:
        # flattened subtrees for the training data only
        train_sents, train_labels = zip(*[sub_sent
                                          for l in train_f
                                          for sub_sent in flattened_subtrees(parse(l))])
        dev_sents, dev_labels = zip(*[flatten_tree(parse(l))
                                      for l in dev_f])
        test_sents, test_labels = zip(*[flatten_tree(parse(l))
                                        for l in test_f])

    print "Train sent size: %d\nDev sent size: %d\nTest sent size: %d" %(
        len(train_sents), len(dev_sents), len(test_sents)
    )
    # gather sentence length information
    sent_lens = [len(sent)
                 for sent in train_sents]
    train_sent_max_len = max(sent_lens)
    print "train_sent_max_len: %d" %(train_sent_max_len)
    print "sent_mean_len: %f" %(np.mean(sent_lens))
    print "sent_median_len: %f" %(np.median(sent_lens))

    dev_sent_max_len = max((len(sent)
                            for sent in dev_sents))
    print "dev_sent_max_len: %d" %(dev_sent_max_len)

    test_sent_max_len = max((len(sent)
                             for sent in test_sents))
    print "test_sent_max_len: %d" %(test_sent_max_len)
    
    # replace every digit character with DIGIT
    # and lowercase everything
    # (somewhat memory inefficient: each sentence is re-joined and re-split)
    print "convert digits..."
    regexp = re.compile(r'\d')
    
    train_sents = [regexp.sub('DIGIT', ' '.join(sent).lower()).split()
                   for sent in train_sents]
    dev_sents = [regexp.sub('DIGIT', ' '.join(sent).lower()).split()
                 for sent in dev_sents]
    test_sents = [regexp.sub('DIGIT', ' '.join(sent).lower()).split()
                  for sent in test_sents]
    
    print "Collecting word frequency"
    # gather words in the train set
    # count their frequency
    word_freq = Counter((w 
                         for sent in train_sents
                         for w in sent))
    
    print "Building word and index mapping"
    # build the word-to-index dictionary and vice versa.
    # mark the infrequency word as unk_token
    frequent_words = [w
                      for w in word_freq 
                      if word_freq[w] > unk_threshold]

    print "Vocab size: %d" %len(frequent_words)
    
    # add the two additional words
    frequent_words.append(unk_token)
    frequent_words.append(pad_token)
    
    word2index = OrderedDict([(w, i)
                              for i, w in enumerate(frequent_words)])
    index2word = OrderedDict([(i, w)
                              for w, i in word2index.items()])
    
    
    padding_index = word2index[pad_token]
    print "padding_index = %d" %(padding_index)
    
    print "Converting sentence to numpy array.."
    
    sent2array_padded = lambda sent, max_len: (
        [word2index.get(word, word2index[unk_token])  
         for word in sent] +
        [padding_index] * (max_len - len(sent)) # add the paddings
    ) 
    
    # sent2array_unpadded = lambda sent: [word2index.get(word, word2index[unk_token])
    #                                     for word in sent]
    
    # construct the sentence data,
    # each sentence is represented by the word indices
    def create_dataset(sents, labels, sent_max_len):
        x = np.array([sent2array_padded(sent, sent_max_len)
                      for sent in sents],
                     dtype="int32")

        y = np.array(labels, dtype="int32")

        return x, y

    
    train_x, train_y = create_dataset(train_sents, train_labels, train_sent_max_len)
    dev_x, dev_y = create_dataset(dev_sents, dev_labels, dev_sent_max_len)
    test_x, test_y = create_dataset(test_sents, test_labels, test_sent_max_len)
    
    
    # load the pretrained embedding
    pkl_data = (
        (train_x, train_y),
        (dev_x, dev_y),
        (test_x, test_y),
        word2index,
        index2word, 
        np.load("data/stanfordSentimentTreebank/trees/pretrained.npy")
    )
    
    print "dumping pickle to %s" %(pkl_path)
    pickle.dump(pkl_data, open(pkl_path, 'w'))

    debug = True
    if debug:
        print type(train_x)
        print train_x[0]
        print train_y[0]
        print dev_x[0]
        print dev_y[0]
        print test_x[0]
        print test_y[0]
        
    return pkl_data
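
A minimal calling sketch for the function above, purely illustrative: the corpus paths and the threshold value are placeholder assumptions, and the unpacking simply mirrors the tuple assembled before pickling:

# Hypothetical invocation; every path and the threshold below are placeholders.
pkl_data = process_stanford_sentiment_corpus(
    "data/stanfordSentimentTreebank/trees/train.txt",
    "data/stanfordSentimentTreebank/trees/dev.txt",
    "data/stanfordSentimentTreebank/trees/test.txt",
    "data/sst_processed.pkl",
    unk_threshold=3)

# The returned tuple mirrors what was written to the pickle.
(train_xy, dev_xy, test_xy, word2index, index2word, pretrained) = pkl_data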
Example #3
def process_stanford_sentiment_corpus(train_path,
                                      dev_path,
                                      test_path,
                                      pkl_path,
                                      unk_threshold,
                                      unk_token='<UNK>',
                                      pad_token='<PADDING>'):
    """
    Input three paths for the PTB tree file of train/validate/test data

    unk_threshold: the frequency threshold below which the word is marked as unk_token

    preproces the data and save the pickle
    
    Return the pickle path
    """
    # parse all the trees
    # and represent each sentence as a list of words

    print "parsing trees.."
    # codecs.open decodes the files as UTF-8; the builtin open()'s third
    # positional argument is a buffer size, not an encoding
    with codecs.open(train_path, "r", "utf8") as train_f, \
         codecs.open(dev_path, "r", "utf8") as dev_f, \
         codecs.open(test_path, "r", "utf8") as test_f:
        # flattened subtrees for the training data only
        train_sents, train_labels = zip(*[
            sub_sent for l in train_f
            for sub_sent in flattened_subtrees(parse(l))
        ])
        dev_sents, dev_labels = zip(*[flatten_tree(parse(l)) for l in dev_f])
        test_sents, test_labels = zip(
            *[flatten_tree(parse(l)) for l in test_f])

    print "Train sent size: %d\nDev sent size: %d\nTest sent size: %d" % (
        len(train_sents), len(dev_sents), len(test_sents))
    # gather sentence length information
    sent_lens = [len(sent) for sent in train_sents]
    train_sent_max_len = max(sent_lens)
    print "train_sent_max_len: %d" % (train_sent_max_len)
    print "sent_mean_len: %f" % (np.mean(sent_lens))
    print "sent_median_len: %f" % (np.median(sent_lens))

    dev_sent_max_len = max((len(sent) for sent in dev_sents))
    print "dev_sent_max_len: %d" % (dev_sent_max_len)

    test_sent_max_len = max((len(sent) for sent in test_sents))
    print "test_sent_max_len: %d" % (test_sent_max_len)

    # replace every digit character with DIGIT
    # and lowercase everything
    # (somewhat memory inefficient: each sentence is re-joined and re-split)
    print "convert digits..."
    regexp = re.compile(r'\d')

    train_sents = [
        regexp.sub('DIGIT', ' '.join(sent).lower()).split()
        for sent in train_sents
    ]
    dev_sents = [
        regexp.sub('DIGIT', ' '.join(sent).lower()).split()
        for sent in dev_sents
    ]
    test_sents = [
        regexp.sub('DIGIT', ' '.join(sent).lower()).split()
        for sent in test_sents
    ]

    print "Collecting word frequency"
    # gather words in the train set
    # count their frequency
    word_freq = Counter((w for sent in train_sents for w in sent))

    print "Building word and index mapping"
    # build the word-to-index dictionary and vice versa.
    # mark the infrequency word as unk_token
    frequent_words = [w for w in word_freq if word_freq[w] > unk_threshold]

    print "Vocab size: %d" % len(frequent_words)

    # add the two additional words
    frequent_words.append(unk_token)
    frequent_words.append(pad_token)

    word2index = OrderedDict([(w, i) for i, w in enumerate(frequent_words)])
    index2word = OrderedDict([(i, w) for w, i in word2index.items()])

    padding_index = word2index[pad_token]
    print "padding_index = %d" % (padding_index)

    print "Converting sentence to numpy array.."

    sent2array_padded = lambda sent, max_len: (
        [word2index.get(word, word2index[unk_token])
         for word in sent] + [padding_index] *
        (max_len - len(sent))  # add the paddings
    )

    # sent2array_unpadded = lambda sent: [word2index.get(word, word2index[unk_token])
    #                                     for word in sent]

    # construct the sentence data,
    # each sentence is represented by the word indices
    def create_dataset(sents, labels, sent_max_len):
        x = np.array([sent2array_padded(sent, sent_max_len) for sent in sents],
                     dtype="int32")

        y = np.array(labels, dtype="int32")

        return x, y

    train_x, train_y = create_dataset(train_sents, train_labels,
                                      train_sent_max_len)
    dev_x, dev_y = create_dataset(dev_sents, dev_labels, dev_sent_max_len)
    test_x, test_y = create_dataset(test_sents, test_labels, test_sent_max_len)

    # load the pretrained embedding
    pkl_data = ((train_x, train_y), (dev_x, dev_y), (test_x, test_y),
                word2index, index2word,
                np.load("data/stanfordSentimentTreebank/trees/pretrained.npy"))

    print "dumping pickle to %s" % (pkl_path)
    pickle.dump(pkl_data, open(pkl_path, 'w'))

    debug = True
    if debug:
        print type(train_x)
        print train_x[0]
        print train_y[0]
        print dev_x[0]
        print dev_y[0]
        print test_x[0]
        print test_y[0]

    return pkl_data
Example #4
        ('PRP', 0, 1),
        ('ADVP', 1, 2),
        ('RB', 1, 2),
        ('VP', 2, 6),
        ('VBZ', 2, 3),
        ('NP-PRD', 3, 6),
        ('DT', 3, 4),
        ('NN', 4, 5),
        ('NN', 5, 6),
        (',', 6, 7),
        ('NP-SBJ', 7, 8),
        ('NNS', 7, 8),
        ('VP', 8, 9),
        ('VBP', 8, 9),
        ('SBAR', 9, 9),
        ('S', 9, 9),
        ('-NONE-', 9, 9),
        ('-NONE-', 9, 9),
        ('.', 9, 10)
    ]

    t = next(ptb.parse(tree))
    u = next(ptb.parse(transformed_tree))

    expectequal(set(spans), set(ptb.all_spans(t)))

    ptb.remove_empty_elements(t)
    ptb.simplify_labels(t)
    t = ptb.add_root(t)
    expectequal(set(ptb.all_spans(t)), set(ptb.all_spans(u)))
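
expectequal is not defined in this fragment; a minimal sketch of such a helper, assuming it does nothing more than assert equality with a readable message:

# Hypothetical helper assumed by the test fragment above.
def expectequal(expected, actual):
    assert expected == actual, "expected %r, got %r" % (expected, actual)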