Example #1
0
    def test_single_word_vectorizer(self):
        """single_word_vectorizer returns the set of significant word stems.

        Per the fixtures below: punctuation is dropped, 'you' is removed,
        'someone' stems to 'someon', and repeated words collapse because the
        result is a set.  (Contract inferred from expected values -- confirm
        against TextUtil.)
        """
        text = 'Then, call someone you love'
        text_vector = set(['Then', 'call', 'someon', 'love'])
        # assertEqual: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(text_vector, TextUtil.single_word_vectorizer(text))

        # Duplicated words in the input must not duplicate entries in the set.
        text = 'Then, call someone you love Then call love, pizza'
        text_vector = set(['Then', 'call', 'someon', 'love', 'pizza'])
        self.assertEqual(text_vector, TextUtil.single_word_vectorizer(text))
Example #2
0
    def transform(collection):
        """Return a new BlogEntryCollection with titles and texts lower-cased.

        All other BlogEntry fields (date, url, source, crawl_url) are copied
        through unchanged; the input collection is not mutated.
        """
        # List comprehension replaces the manual append loop: same entries,
        # same order.
        lowered = [
            BlogEntry(TextUtil.to_lower_case(entry.title()),
                      entry.date(),
                      entry.url(),
                      TextUtil.to_lower_case(entry.text()),
                      entry.source,       # attribute access in the original,
                      entry.crawl_url)    # not method calls -- kept as-is
            for entry in collection
        ]
        return BlogEntryCollection(lowered)
Example #3
0
 def test_eliminate_punctuation(self):
     """eliminate_punctuation removes punctuation and keeps letters intact."""
     # assertEqual: assertEquals is a deprecated alias of assertEqual.
     self.assertEqual('PIZZApizza',
                      TextUtil.eliminate_punctuation("PIZZA.pizza"))
     # A string without punctuation passes through unchanged.
     self.assertEqual('PIZZApizza',
                      TextUtil.eliminate_punctuation("PIZZApizza"))
     # Heavy leading/embedded punctuation is stripped completely.
     self.assertEqual(
         'PIZZApizza',
         TextUtil.eliminate_punctuation(";;;;;.,,PIZZA;',;pizza"))
Example #4
0
    def from_json_object(json_object):
        """Build a BlogEntry from one crawled JSON record.

        Each field of the record arrives as a list and is flattened to a
        single string via TextUtil.unpack_list.  The source name is
        hard-coded and the crawl URL mirrors the entry URL; the per-record
        'source'/'crawl_url' fields are deliberately ignored for now.
        """
        unpack = TextUtil.unpack_list
        entry_title = unpack(json_object['title'])
        entry_date = unpack(json_object['timestamp'])
        entry_text = unpack(json_object['raw_content'])
        entry_url = unpack(json_object['url'])
        entry_source = 'joy_the_baker'  # was: TextUtil.to_utf8(json_object['source'])
        # crawl URL intentionally duplicates the entry URL
        return BlogEntry(entry_title, entry_date, entry_url, entry_text,
                         entry_source, entry_url)
Example #5
0
    def test_unpack_list(self):
        """unpack_list flattens nested lists and scalars into one
        comma-joined string where every element carries a trailing comma;
        None and empty lists contribute nothing."""
        l1 = ['a', 'b']
        l2 = 'stringa'
        l3 = ['one', 'two', 'three', ['d', 'f']]
        l4 = [11, 25, 3, ""]
        l5 = []
        l6 = None

        test_lists = [l1, l2, l3, l4, l5, l6]

        # Depth-first flattening; the empty string yields the double comma.
        # assertEqual: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual('a,b,stringa,one,two,three,d,f,11,25,3,,',
                         TextUtil.unpack_list(test_lists))
        # A bare string is one element, not an iterable of characters.
        self.assertEqual(l2 + ",", TextUtil.unpack_list(l2))
        # Non-string scalars are stringified.
        self.assertEqual('3,', TextUtil.unpack_list(3))
def main():
    # Train a character-level language model on 'input.txt', then sample
    # a few sentences from the trained model.
    # NOTE(review): Python 2 / TensorFlow 1.x code (print statements,
    # xrange, tf.Session, tf.variable_scope) -- kept as-is.
    util = TextUtil('input.txt')
    train_config = CharacterModelConfig(util.vocab_size)
    train_config.hidden_depth = 2
    train_config.batch_size = 256
    # Evaluation/sampling processes one character at a time and disables
    # dropout (keep_prob = 1.0).
    eval_config = deepcopy(train_config)
    eval_config.batch_size = eval_config.seq_length = 1
    eval_config.keep_prob = 1.0

    print train_config

    tf.reset_default_graph()

    # Same scope name with reuse=True so the eval model shares the
    # trained model's weights.
    with tf.variable_scope('model', reuse=None):
        train_model = CharacterModel(train_config)
    with tf.variable_scope('model', reuse=True):
        eval_model = CharacterModel(eval_config)

    with tf.Session() as sess:
        loss_pp_iter = train(sess,
                             train_model,
                             util,
                             num_epochs=50,
                             eval_model=eval_model)
        # Emit ten sampled sentences, each seeded with the prefix 'The'.
        for i in xrange(10):
            print '\n\nSample sentence %d' % (i + 1)
            print sample(sess, eval_model, util, 'The', length=60)
            print '\n'
def main():
    """Encode train/valid/test text files against a word embedding and dump
    the encoded data plus embedding metadata to msgpack files."""
    args = arg_parse()
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    log = logging.getLogger(__name__)
    log.info(vars(args))
    #embedding = text.embedding.CustomEmbedding(args.embedding_file, elem_delim = ' ')
    #embedding.update_token_vectors('<unk>', nd.uniform(low = -0.05, high = 0.05, shape = 300))

    # TextUtil wraps the vocabulary and embedding files; len(embedding) is
    # the vocabulary size and vec_len the embedding dimension.
    embedding = TextUtil(args.words_file, args.embedding_file)
    log.info(
        'embedding file loaded, # of words = {:d}, dimension = {:d}'.format(
            len(embedding), embedding.vec_len))

    # Encode each split; data[target] holds the encoded examples and labels.
    data = {}
    for target in ['train', 'valid', 'test']:
        raw_data, label = load_raw_text(args.data_folder +
                                        '{:s}.txt'.format(target),
                                        is_test=args.is_test == 1)
        enc = onehot_enc(raw_data, embedding, args.sent_len, args.list_len,
                         args.mode)
        data[target] = {}
        data[target]['data'] = enc
        data[target]['label'] = label
        log.info('# of {:s} examples = {:d}'.format(target, len(enc)))


# NOTE(review): dead alternative loading path (80/10/10 split from one CSV),
# kept for reference -- consider deleting.
#    data = {}
#    data_tuple = load_encoded_csv('{:s}/raw_data.csv'.format(args.data_folder), args.num_workers)
#    targets = ['train', 'valid', 'test']
#    interval = [(0, 0.8), (0.8, 0.9), (0.9, 1)]
#    for i in range(len(targets)):
#        target = targets[i]
#        start = int(interval[i][0] * len(data_tuple))
#        end = int(interval[i][1] * len(data_tuple))
#        data[target] = {}
#        data[target]['label'] = [tup[1] for tup in data_tuple[start : end]]
#        data[target]['data'] = [tup[2] for tup in data_tuple[start : end]]
#        log.info('# of {:s} examples = {:d}'.format(target, len(data[target]['label'])))

    # msgpack requires binary mode ('wb').
    with open('{:s}/{:s}'.format(args.data_folder, args.data_out_file),
              'wb') as f:
        msgpack.dump(data, f)

    # Separate metadata file: where the embedding came from and its matrix.
    meta = {
        'embedding_file': args.embedding_file,
        'embedding_dim': embedding.vec_len,
        'vocab_size': len(embedding),
        'embedding': embedding.idx_to_vec,
    }
    with open('{:s}/{:s}'.format(args.data_folder, args.meta_out_file),
              'wb') as f:
        msgpack.dump(meta, f)
Example #8
0
 def transform(collection, foodwords_list, neighboring_words = 2):
     """For each entry, keep only the words within `neighboring_words`
     positions of any food word, collected as a set on a new BlogEntry.

     Returns a plain list of BlogEntry objects.
     """
     # Normalize case and punctuation first so food-word matching works.
     collection = ToLowerCase.transform(collection)
     collection = StripPunctuation.transform(collection)
     output_collection = list()
     for entry in collection:
         # NOTE(review): raw_words is indexed below, so this call must
         # return a sequence here (the trailing comment hints at optional
         # unique/stemming parameters) -- confirm against TextUtil.
         raw_words =  TextUtil.make_lowercase_word_vector(entry.text()) #, unique = False, stemming = True
         tot_words = len(raw_words)
         # Start/stop offsets keep the +/- neighboring_words slice in range.
         index = neighboring_words
         raw_content_words_list = list()
         while index <= (tot_words - neighboring_words):
             if TextUtil.list_contains(raw_words[index], foodwords_list):
                 # Window of words centered on the matched food word.
                 sub_sentence = raw_words[(index-neighboring_words):(index+neighboring_words+1)]

                 for word in sub_sentence:
                     raw_content_words_list.append(word)

                 # NOTE(review): advances by neighboring_words, not the full
                 # window width, so consecutive matches can re-collect
                 # overlapping words -- confirm this is intended.
                 index += neighboring_words
             else:
                 index += 1

         # NOTE(review): BlogEntry built with 4 args here while other
         # fragments pass 6 (source, crawl_url) -- verify the constructor
         # supports both.
         output_collection.append(BlogEntry(entry.title(), entry.date(), entry.url(), set(raw_content_words_list)))
     return output_collection
Example #9
0
    def transform_to_word_matrix(collection):
        """Build a binary entry-by-word incidence matrix for the collection.

        Returns (matrix, entry_titles): matrix[row, col] is 1 iff the word
        assigned column id `col` occurs in entry `row`.  Words with 8 or
        fewer recorded occurrences are dropped (their columns stay zero).
        """
        words_dict = {}    # word -> column id
        words_matrix = {}  # column id -> list of entry ids (one per occurrence)
        word_counter = 0
        empty_array = [0] * collection.size()  # NOTE(review): unused
        entry_titles = list()
        for entry_id, entry in enumerate(collection):
            raw_words = TextUtil.make_lowercase_word_vector(entry.text())
            entry_titles.append(entry.title())
            for w_id, word in enumerate(raw_words):
                if word not in words_dict:
                    # First sighting: allocate the next column id and record
                    # this entry as the first occurrence.
                    words_dict[word] = word_counter
                    words_matrix[word_counter] = list()
                    #print 'before:', words_matrix[word_counter].shape, entry_id, words_matrix[word_counter]
                    words_matrix[word_counter].append(entry_id)
                    #print words_matrix[word_counter].shape, entry_id, words_matrix[word_counter]
                    word_counter += 1
                else:
                    words_matrix[words_dict[word]].append(entry_id)

        tot_words = len(words_dict.keys())
        # Filter: keep only words whose occurrence list (entry ids, with
        # repeats for multiple hits in one entry) is longer than 8.
        wd = {}
        wwd = {}
        for word in words_dict:
            if len(words_matrix[words_dict[word]]) > 8:
                wd[words_dict[word]] = words_matrix[words_dict[word]]
                wwd[word] = words_dict[word]

        words_matrix = wd
        words_dict = wwd
        # Matrix keeps the PRE-filter width (tot_words), so filtered-out
        # word ids simply leave all-zero columns; surviving ids stay valid.
        matrix = np.zeros((collection.size(), tot_words))
        for col_index in words_matrix.keys():
            for row_index in words_matrix[col_index]:
                #print row_index, col_index
                matrix[row_index, col_index] = 1

        print words_dict.keys()

        return matrix, entry_titles
Example #10
0
 def test_to_utf8(self):
     """to_utf8 encodes a unicode string to its UTF-8 byte representation."""
     one_string = u'la vita \u00E8 bella'
     # assertEqual: assertEquals is a deprecated alias of assertEqual.
     # NOTE(review): the expected value is a native str literal, so this
     # relies on the source file itself being UTF-8 (Python 2 semantics).
     self.assertEqual('la vita è bella', TextUtil.to_utf8(one_string))
Example #11
0
 def test_make_lowercase_word_vector(self):
     """make_lowercase_word_vector lower-cases, stems, deduplicates and
     drops insignificant words ('you'), returning a set."""
     text = 'Then, call someone you lovE then call love, pizza'
     text_vector = set(['then', 'call', 'someon', 'love', 'pizza'])
     # assertEqual: assertEquals is a deprecated alias of assertEqual.
     self.assertEqual(text_vector,
                      TextUtil.make_lowercase_word_vector(text))
Example #12
0
 def test_to_lower_case(self):
     """to_lower_case lower-cases regardless of the input's casing."""
     # assertEqual: assertEquals is a deprecated alias of assertEqual.
     self.assertEqual('pizza', TextUtil.to_lower_case("PIZZA"))
     self.assertEqual('pizza', TextUtil.to_lower_case("PizZA"))
     # Already-lowercase input passes through unchanged.
     self.assertEqual('pizza', TextUtil.to_lower_case("pizza"))
Example #13
0
 def eliminate_punctuation(in_string):
     # Thin delegation to TextUtil.eliminate_punctuation.
     # NOTE(review): if this method were itself TextUtil.eliminate_punctuation
     # it would recurse forever -- presumably it lives on another class;
     # confirm against the enclosing definition.
     return TextUtil.eliminate_punctuation(in_string)