def test_single_word_vectorizer(self):
    """single_word_vectorizer returns the set of (stemmed) tokens of a text."""
    text = 'Then, call someone you love'
    text_vector = set(['Then', 'call', 'someon', 'love'])
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(text_vector, TextUtil.single_word_vectorizer(text))
    # Repeated words must collapse to a single set entry.
    text = 'Then, call someone you love Then call love, pizza'
    text_vector = set(['Then', 'call', 'someon', 'love', 'pizza'])
    self.assertEqual(text_vector, TextUtil.single_word_vectorizer(text))
def transform(collection):
    """Return a new BlogEntryCollection whose titles and texts are lower-cased.

    Dates, URLs, source and crawl URL are copied through unchanged.
    """
    lowered_entries = [
        BlogEntry(
            TextUtil.to_lower_case(entry.title()),
            entry.date(),
            entry.url(),
            TextUtil.to_lower_case(entry.text()),
            entry.source,
            entry.crawl_url,
        )
        for entry in collection
    ]
    return BlogEntryCollection(lowered_entries)
def test_eliminate_punctuation(self):
    """eliminate_punctuation strips every punctuation character, keeping case."""
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual('PIZZApizza',
                     TextUtil.eliminate_punctuation("PIZZA.pizza"))
    # A string without punctuation passes through unchanged.
    self.assertEqual('PIZZApizza',
                     TextUtil.eliminate_punctuation("PIZZApizza"))
    # Leading / embedded runs of punctuation are removed entirely.
    self.assertEqual(
        'PIZZApizza',
        TextUtil.eliminate_punctuation(";;;;;.,,PIZZA;',;pizza"))
def from_json_object(json_object):
    """Build a BlogEntry from one crawled JSON record.

    List-valued JSON fields are flattened to comma-joined strings via
    TextUtil.unpack_list. The source is hard-coded and the crawl URL is
    mirrored from the entry URL (the corresponding JSON fields are not
    used — see the disabled alternatives kept in the original).
    """
    unpack = TextUtil.unpack_list
    title = unpack(json_object['title'])
    date_string = unpack(json_object['timestamp'])
    raw_text = unpack(json_object['raw_content'])
    url = unpack(json_object['url'])
    source = 'joy_the_baker'  # JSON 'source' field deliberately ignored
    crawl_url = url           # JSON 'crawl_url' field deliberately ignored
    return BlogEntry(title, date_string, url, raw_text, source, crawl_url)
def test_unpack_list(self):
    """unpack_list flattens arbitrarily nested lists / scalars to 'a,b,...,' text."""
    l1 = ['a', 'b']
    l2 = 'stringa'
    l3 = ['one', 'two', 'three', ['d', 'f']]
    l4 = [11, 25, 3, ""]
    l5 = []
    l6 = None
    test_lists = [l1, l2, l3, l4, l5, l6]
    # assertEqual: assertEquals is a deprecated unittest alias.
    # Nested lists flatten depth-first; empty list and None vanish;
    # every emitted element gets a trailing comma.
    self.assertEqual('a,b,stringa,one,two,three,d,f,11,25,3,,',
                     TextUtil.unpack_list(test_lists))
    # A bare string is emitted whole, with the trailing comma.
    self.assertEqual(l2 + ",", TextUtil.unpack_list(l2))
    # Non-string scalars are stringified.
    self.assertEqual('3,', TextUtil.unpack_list(3))
def main():
    # Train a character-level language model on 'input.txt', then sample
    # from it. (Python 2 / TF 1.x script: print statements, xrange,
    # tf.reset_default_graph / tf.Session.)
    util = TextUtil('input.txt')
    train_config = CharacterModelConfig(util.vocab_size)
    train_config.hidden_depth = 2
    train_config.batch_size = 256
    # The sampling model reuses the training hyper-parameters, but feeds
    # one character at a time (batch == seq_length == 1) and disables
    # dropout (keep_prob == 1.0).
    eval_config = deepcopy(train_config)
    eval_config.batch_size = eval_config.seq_length = 1
    eval_config.keep_prob = 1.0
    print train_config
    tf.reset_default_graph()
    # Building both models in the SAME variable scope, the second with
    # reuse=True, makes eval_model share the trained weights rather than
    # allocating a second parameter set. The order matters: the reusing
    # scope must come after the creating one.
    with tf.variable_scope('model', reuse=None):
        train_model = CharacterModel(train_config)
    with tf.variable_scope('model', reuse=True):
        eval_model = CharacterModel(eval_config)
    with tf.Session() as sess:
        # train() presumably runs num_epochs over the corpus and returns
        # per-iteration loss/perplexity — TODO confirm return shape.
        loss_pp_iter = train(sess, train_model, util, num_epochs=50,
                             eval_model=eval_model)
        # Draw 10 sample continuations of the prompt 'The'.
        for i in xrange(10):
            print '\n\nSample sentence %d' % (i + 1)
            print sample(sess, eval_model, util, 'The', length=60)
            print '\n'
def main():
    """Encode the train/valid/test text splits against a word embedding
    and dump the result (plus embedding metadata) as msgpack files.

    Reads '<data_folder>/{train,valid,test}.txt', index-encodes each split
    with onehot_enc, and writes two files under args.data_folder:
    args.data_out_file (the encoded splits and labels) and
    args.meta_out_file (embedding matrix, dimension, vocab size).
    """
    args = arg_parse()
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
    log = logging.getLogger(__name__)
    log.info(vars(args))

    # TextUtil wraps the vocabulary + pretrained vector file; supports
    # len() (vocab size), .vec_len and .idx_to_vec.
    embedding = TextUtil(args.words_file, args.embedding_file)
    log.info(
        'embedding file loaded, # of words = {:d}, dimension = {:d}'.format(
            len(embedding), embedding.vec_len))

    data = {}
    for target in ['train', 'valid', 'test']:
        raw_data, label = load_raw_text(
            args.data_folder + '{:s}.txt'.format(target),
            is_test=args.is_test == 1)
        enc = onehot_enc(raw_data, embedding, args.sent_len, args.list_len,
                         args.mode)
        data[target] = {'data': enc, 'label': label}
        log.info('# of {:s} examples = {:d}'.format(target, len(enc)))

    with open('{:s}/{:s}'.format(args.data_folder,
                                 args.data_out_file), 'wb') as f:
        msgpack.dump(data, f)

    meta = {
        'embedding_file': args.embedding_file,
        'embedding_dim': embedding.vec_len,
        'vocab_size': len(embedding),
        'embedding': embedding.idx_to_vec,
    }
    with open('{:s}/{:s}'.format(args.data_folder,
                                 args.meta_out_file), 'wb') as f:
        msgpack.dump(meta, f)
def transform(collection, foodwords_list, neighboring_words = 2):
    # Reduce each entry's text to the words surrounding food-word mentions:
    # lower-case and strip punctuation first, then for every food word keep
    # a window of `neighboring_words` words on each side (inclusive of the
    # food word itself). Returns a plain list of BlogEntry objects whose
    # text payload is a SET of context words.
    # NOTE(review): the returned BlogEntry is built with only 4 positional
    # args, while other call sites pass 6 — confirm the constructor signature.
    collection = ToLowerCase.transform(collection)
    collection = StripPunctuation.transform(collection)
    output_collection = list()
    for entry in collection:
        # NOTE(review): indexing below assumes this returns an ordered
        # sequence, yet the unit tests compare its result to a set —
        # confirm (the disabled 'unique = False' kwarg suggests an
        # order-preserving mode once existed).
        raw_words = TextUtil.make_lowercase_word_vector(entry.text()) #, unique = False, stemming = True
        tot_words = len(raw_words)
        # Start at offset `neighboring_words` and stop the same distance
        # from the end so the window slice never leaves the list.
        # NOTE(review): this means words in the first/last
        # `neighboring_words` positions are never tested as food words —
        # confirm that is intentional.
        index = neighboring_words
        raw_content_words_list = list()
        while index <= (tot_words - neighboring_words):
            if TextUtil.list_contains(raw_words[index], foodwords_list):
                # Window centred on the matched food word, inclusive.
                sub_sentence = raw_words[(index-neighboring_words):(index+neighboring_words+1)]
                for word in sub_sentence:
                    raw_content_words_list.append(word)
                # Skip ahead to reduce overlap between captured windows.
                index += neighboring_words
            else:
                index += 1
        output_collection.append(BlogEntry(entry.title(), entry.date(), entry.url(), set(raw_content_words_list)))
    return output_collection
def transform_to_word_matrix(collection):
    # Build a binary entry-by-word incidence matrix over the collection.
    # Returns (matrix, entry_titles): matrix[i, j] == 1 iff entry i contains
    # the word whose column id is j. Words occurring in 8 or fewer entry
    # mentions are filtered out AFTER ids are assigned, so their columns
    # remain in the matrix as all-zero columns (tot_words is the pre-filter
    # vocabulary size).
    words_dict = {}     # word -> column id
    words_matrix = {}   # column id -> list of entry ids mentioning the word
    word_counter = 0
    # NOTE(review): empty_array is never used below — dead local.
    empty_array = [0] * collection.size()
    entry_titles = list()
    for entry_id, entry in enumerate(collection):
        raw_words = TextUtil.make_lowercase_word_vector(entry.text())
        entry_titles.append(entry.title())
        # NOTE(review): w_id is unused; duplicate words within one entry
        # append entry_id multiple times (harmless: the cell is set to 1).
        for w_id, word in enumerate(raw_words):
            if word not in words_dict:
                words_dict[word] = word_counter
                words_matrix[word_counter] = list()
                #print 'before:', words_matrix[word_counter].shape, entry_id, words_matrix[word_counter]
                words_matrix[word_counter].append(entry_id)
                #print words_matrix[word_counter].shape, entry_id, words_matrix[word_counter]
                word_counter += 1
            else:
                words_matrix[words_dict[word]].append(entry_id)
    # Pre-filter vocabulary size — intentionally captured BEFORE pruning so
    # column ids stay valid; pruned words just leave zero columns.
    tot_words = len(words_dict.keys())
    # Keep only words with more than 8 recorded mentions (column ids kept).
    wd = {}
    wwd = {}
    for word in words_dict:
        if len(words_matrix[words_dict[word]]) > 8:
            wd[words_dict[word]] = words_matrix[words_dict[word]]
            wwd[word] = words_dict[word]
    words_matrix = wd
    words_dict = wwd
    matrix = np.zeros((collection.size(), tot_words))
    for col_index in words_matrix.keys():
        for row_index in words_matrix[col_index]:
            #print row_index, col_index
            matrix[row_index, col_index] = 1
    # NOTE(review): debug print of the surviving vocabulary (Python 2).
    print words_dict.keys()
    return matrix, entry_titles
def test_to_utf8(self):
    """to_utf8 encodes a unicode string into its UTF-8 byte representation."""
    one_string = u'la vita \u00E8 bella'
    # assertEqual: assertEquals is a deprecated unittest alias.
    # The expected literal holds the UTF-8 byte sequence for the accented 'è'.
    self.assertEqual('la vita è bella', TextUtil.to_utf8(one_string))
def test_make_lowercase_word_vector(self):
    """make_lowercase_word_vector lower-cases, stems, and de-duplicates tokens."""
    text = 'Then, call someone you lovE then call love, pizza'
    text_vector = set(['then', 'call', 'someon', 'love', 'pizza'])
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(text_vector, TextUtil.make_lowercase_word_vector(text))
def test_to_lower_case(self):
    """to_lower_case lower-cases mixed, upper, and already-lower input."""
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual('pizza', TextUtil.to_lower_case("PIZZA"))
    self.assertEqual('pizza', TextUtil.to_lower_case("PizZA"))
    # Already-lowercase input is unchanged.
    self.assertEqual('pizza', TextUtil.to_lower_case("pizza"))
def eliminate_punctuation(in_string):
    # Thin delegate to TextUtil.eliminate_punctuation.
    # NOTE(review): if this def is itself TextUtil.eliminate_punctuation
    # (i.e. defined inside the TextUtil class), this call recurses
    # infinitely — confirm it lives in a different namespace/module.
    return TextUtil.eliminate_punctuation(in_string)