from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def get_tf_idf_model(citations=None):
    if citations is None:
        # TextPreprocessor acts as a dict of citation records (title/abstract/...)
        citations = TextPreprocessor()
        citations.preprocess()
    documents = [
        citation['title'] + ' \n' + citation['abstract']
        for citation in list(citations.values())
    ]
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
    bigrams = bigram_vectorizer.fit_transform(documents)
    tfidf = TfidfTransformer().fit_transform(bigrams)
    return citations, bigram_vectorizer, tfidf
from fast_bert.prediction import BertClassificationPredictor


class SentimentAnalyzer(object):

    def __init__(self, model_path, label_path):
        self.predictor = BertClassificationPredictor(
            model_path=model_path,
            label_path=label_path,  # location of the labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=False)
        self.preprocessor = TextPreprocessor()

    def predict_sentiment(self, tweet):
        tweet = self.preprocessor.process(tweet)
        print(tweet)
        prediction = self.predictor.predict(tweet)
        print(prediction)
        for label, confidence in prediction:
            if label == "0" and confidence >= 0.7:
                return "Negative"
            if label == "4" and confidence >= 0.7:
                return "Positive"
        return "Neutral"

    def batch_predict_sentiment(self, tweets):
        processed_tweets = [self.preprocessor.process(tweet) for tweet in tweets]
        predictions = self.predictor.predict_batch(processed_tweets)
        print(predictions)
        results = []
        for prediction in predictions:
            label_to_prob = dict(prediction)
            if label_to_prob["0"] >= 0.7:
                results.append("Negative")
            elif label_to_prob["4"] >= 0.7:
                results.append("Positive")
            else:
                results.append("Neutral")
        return results
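
# Usage sketch (paths are placeholders): the "0" and "4" labels follow the
# Sentiment140 convention, where 0 is negative and 4 is positive.
analyzer = SentimentAnalyzer(model_path='models/bert_out', label_path='models')
print(analyzer.predict_sentiment("I love this phone!"))
print(analyzer.batch_predict_sentiment(["great day", "worst service ever"]))
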
    def predict(self, model):
        tqdm.pandas()
        print('preprocessing test data...')
        tp = TextPreprocessor()
        self.dftest['clean_text'] = self.dftest['text'].progress_apply(tp.pre_process_text)
        # map the Sentiment140 positive label (4) to 1 so the labels are binary
        self.dftest['label'] = self.dftest['label'].replace(4, 1)
        print('bag of words test data...')
        X_test = self.vect.transform(self.dftest['clean_text'])
        Y_test = self.dftest['label']
        print('predict...')
        preds = self.evaluate_test(X_test, Y_test, model)
        return preds
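
    # `evaluate_test` is referenced above but not shown in this snippet; a
    # minimal sketch of what it might do, assuming `model` is a fitted
    # scikit-learn classifier (an assumption, not the original implementation):
    def evaluate_test(self, X_test, Y_test, model):
        from sklearn.metrics import classification_report
        preds = model.predict(X_test)
        print(classification_report(Y_test, preds))
        return preds
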
    def predict(self, model):
        tqdm.pandas()
        print('preprocessing test data...')
        tp = TextPreprocessor()
        self.dftest['clean_text'] = self.dftest['text'].progress_apply(
            tp.pre_process_text_no_stemming)
        print('word embeddings test data...')
        sequences = self.vect.texts_to_sequences(self.dftest['clean_text'].values)
        X_test = pad_sequences(sequences, maxlen=self.max_len)
        y_test = self.dftest['label'].values
        print('predict...')
        preds = model.predict(X_test)
        y_preds = [self.prob_to_sentiment_label(pred) for pred in preds]
        # attach the per-class probabilities to each prediction
        prob_map = ['negative', 'neutral', 'positive']
        probs = [{prob_map[i]: prob for i, prob in enumerate(pred)} for pred in preds]
        self.dftest['pred'] = y_preds
        self.dftest['prob'] = probs
        submission = self.dftest[['text', 'label', 'pred', 'prob']]
        submission.to_csv('data/predictions_3_categories.csv')
        score, acc = model.evaluate(X_test,
                                    np_utils.to_categorical(self.dftest['label'].values),
                                    verbose=2,
                                    batch_size=128)
        print("score: %.2f" % score)
        print("acc: %.2f" % acc)
        return y_preds
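
    # `prob_to_sentiment_label` is referenced above but not shown; presumably it
    # maps a softmax vector to its argmax class index (0 = negative, 1 = neutral,
    # 2 = positive, matching `prob_map`). A minimal sketch, as an assumption:
    def prob_to_sentiment_label(self, pred):
        import numpy as np
        return int(np.argmax(pred))
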
def init_sentence_encoder():
    global session, graph, text_preprocessor, encoding_ops, messages_plh
    graph = tf.Graph()
    text_preprocessor = TextPreprocessor()
    print(" Start initializing TensorFlow Hub")
    os.environ["TFHUB_CACHE_DIR"] = '/tf_hub_cache'
    # build the encoding ops first, then initialize them inside the session
    with graph.as_default():
        module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
        embed = hub.Module(module_url)
        messages_plh = tf.placeholder(dtype=tf.string, shape=[None])
        encoding_ops = embed(messages_plh)
        session = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    print(" Model Sentence Encoder is loaded")
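
# Usage sketch: encode a batch of sentences through the globals initialized
# above; USE v2 returns one 512-dimensional vector per input string.
init_sentence_encoder()
with graph.as_default():
    vectors = session.run(encoding_ops,
                          feed_dict={messages_plh: ["hello world", "good morning"]})
print(vectors.shape)  # (2, 512)
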
def main(hparams: HParams):
    '''Generate captions from images.'''
    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')
    text_preprocessor = TextPreprocessor.load(hparams.text_preprocessor_path)
    transform = transforms.Compose([
        transforms.Resize([hparams.crop_size, hparams.crop_size]),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # build model
    encoder = EncoderCNN(hparams.hidden_dim).eval()
    decoder = FactoredLSTM(hparams.embed_dim, text_preprocessor.vocab_size,
                           hparams.hidden_dim, hparams.style_dim,
                           hparams.num_layers, train=False, device=device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    checkpoints = torch.load(hparams.checkpoint_path, map_location=device)
    encoder.load_state_dict(checkpoints['encoder'])
    decoder.load_state_dict(checkpoints['decoder'])

    img_names, img_list = load_images(hparams.img_dir, transform)

    for img_name, img in zip(img_names, img_list):
        img = img.to(device)
        features = encoder(img)
        if hparams.decoder == 'greedy':
            output = decoder.sample_greedy(features, hparams.gen_max_len,
                                           hparams.mode,
                                           text_preprocessor.SOS_ID,
                                           text_preprocessor.EOS_ID)
            output = output[0].cpu().tolist()
        else:
            output = decoder.sample_beam(features, hparams.beam_width,
                                         hparams.gen_max_len, hparams.mode,
                                         text_preprocessor.SOS_ID,
                                         text_preprocessor.EOS_ID)
        output = output[1:output.index(text_preprocessor.EOS_ID)]  # delete SOS and EOS
        caption = text_preprocessor.indice2tokens(output)
        print(img_name)
        print(' '.join(caption))
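
# `load_images` is referenced above but not shown; a minimal sketch that walks
# an image directory and applies the transform (an assumption, not necessarily
# the original helper):
import os
from PIL import Image

def load_images(img_dir, transform):
    img_names = sorted(os.listdir(img_dir))
    # unsqueeze(0) adds the batch dimension the encoder expects
    img_list = [transform(Image.open(os.path.join(img_dir, name)).convert('RGB')).unsqueeze(0)
                for name in img_names]
    return img_names, img_list
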
def get_most_similar_documents(tfidf_matrix, vectorizer, query):
    # NOTE: this fits a fresh TfidfTransformer on the single query vector;
    # reusing the transformer fitted on the corpus would give properly
    # weighted query terms.
    query_tfidf = TfidfTransformer().fit_transform(vectorizer.transform([query]))
    document_similarities = linear_kernel(query_tfidf, tfidf_matrix).flatten()
    return document_similarities.argsort()[::-1]


if __name__ == '__main__':
    citations = TextPreprocessor()
    input("Hit enter to continue...")
    count = 0
    skipped_count = 0
    for article, attributes in citations.items():
        # Skip articles that have no neighbors
        if len(attributes['neighbors']) == 0:
            skipped_count += 1
            continue
        candidate_terms = get_candidates(citations, article)
        # Make features
        # Just printing them for now
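
# Usage sketch: build the tf-idf model from the earlier snippet, then rank the
# corpus against a free-text query (the query string is a placeholder):
citations, vectorizer, tfidf = get_tf_idf_model()
ranking = get_most_similar_documents(tfidf, vectorizer, 'text preprocessing pipelines')
print(ranking[:10])  # indices of the ten most similar documents
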
def main(hparams: HParams):
    '''Set up and run training.'''
    if torch.cuda.is_available() and not hparams.gpus:
        warnings.warn(
            'WARNING: you have a CUDA device, so you should probably run with -gpus 0')

    device = torch.device(hparams.gpus if torch.cuda.is_available() else 'cpu')

    # data setup
    print('Loading vocabulary...')
    text_preprocessor = TextPreprocessor.load(hparams.preprocessor_path)
    transform = transforms.Compose([
        transforms.Resize([hparams.img_size, hparams.img_size]),
        transforms.RandomCrop([hparams.crop_size, hparams.crop_size]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # create dataloaders
    print('Creating DataLoader...')
    normal_data_loader = get_image_caption_loader(
        hparams.img_dir,
        hparams.normal_caption_path,
        text_preprocessor,
        hparams.normal_batch_size,
        transform,
        shuffle=True,
        num_workers=hparams.num_workers,
    )
    style_data_loader = get_caption_loader(
        hparams.style_caption_path,
        text_preprocessor,
        batch_size=hparams.style_batch_size,
        shuffle=True,
        num_workers=hparams.num_workers,
    )

    if hparams.train_from:
        print('Loading checkpoint...')
        checkpoint = torch.load(hparams.train_from)

    # the optimizers are always constructed; when resuming, their state is
    # restored from the checkpoint further below
    normal_opt = Optim(
        hparams.optimizer,
        hparams.normal_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )
    style_opt = Optim(
        hparams.optimizer,
        hparams.style_lr,
        hparams.max_grad_norm,
        hparams.lr_decay,
        hparams.start_decay_at,
    )

    print('Building model...')
    encoder = EncoderCNN(hparams.hidden_dim)
    decoder = FactoredLSTM(hparams.embed_dim, text_preprocessor.vocab_size,
                           hparams.hidden_dim, hparams.style_dim,
                           hparams.num_layers, hparams.random_init,
                           hparams.dropout_ratio, train=True, device=device)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # loss and optimizers: the default (factual) parameters and the style
    # parameters are optimized separately
    criterion = nn.CrossEntropyLoss(ignore_index=text_preprocessor.PAD_ID)
    normal_params = list(encoder.parameters()) + list(decoder.default_parameters())
    style_params = list(decoder.style_parameters())
    normal_opt.set_parameters(normal_params)
    style_opt.set_parameters(style_params)

    if hparams.train_from:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        normal_opt.load_state_dict(checkpoint['normal_opt'])
        style_opt.load_state_dict(checkpoint['style_opt'])

    # training loop
    print('Start training...')
    for epoch in range(hparams.num_epoch):
        sum_normal_loss, sum_style_loss, sum_normal_ppl, sum_style_ppl = 0, 0, 0, 0

        # normal (factual) captions
        for i, (images, in_captions, out_captions, lengths) in enumerate(normal_data_loader):
            images = images.to(device)
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # forward, backward and optimize
            features = encoder(images)
            outputs = decoder(in_captions, features, mode='default')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            normal_opt.step()

            sum_normal_loss += loss.item()
            sum_normal_ppl += np.exp(loss.item())
            if i % hparams.normal_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Normal Step: [{i}/{len(normal_data_loader)}] '
                    f'Normal Loss: {loss.item():.4f}, Perplexity: {np.exp(loss.item()):5.4f}')

        # style captions
        for i, (in_captions, out_captions, lengths) in enumerate(style_data_loader):
            in_captions = in_captions.to(device)
            out_captions = out_captions.contiguous().view(-1).to(device)

            # forward, backward and optimize
            outputs = decoder(in_captions, None, mode='style')
            loss = criterion(outputs.view(-1, outputs.size(-1)), out_captions)
            decoder.zero_grad()
            loss.backward()
            style_opt.step()

            sum_style_loss += loss.item()
            sum_style_ppl += np.exp(loss.item())
            if i % hparams.style_log_step == 0:
                print(
                    f'Epoch [{epoch}/{hparams.num_epoch}], Style Step: [{i}/{len(style_data_loader)}] '
                    f'Style Loss: {loss.item():.4f}, Perplexity: {np.exp(loss.item()):5.4f}')

        model_params = {
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'epoch': epoch,
            'normal_opt': normal_opt.optimizer.state_dict(),
            'style_opt': style_opt.optimizer.state_dict(),
        }
        avg_normal_loss = sum_normal_loss / len(normal_data_loader)
        avg_style_loss = sum_style_loss / len(style_data_loader)
        avg_normal_ppl = sum_normal_ppl / len(normal_data_loader)
        avg_style_ppl = sum_style_ppl / len(style_data_loader)
        print(f'Epoch [{epoch}/{hparams.num_epoch}] statistics')
        print(f'Normal Loss: {avg_normal_loss:.4f} Normal ppl: {avg_normal_ppl:5.4f} '
              f'Style Loss: {avg_style_loss:.4f} Style ppl: {avg_style_ppl:5.4f}')

        torch.save(
            model_params,
            f'{hparams.model_path}/n-loss_{avg_normal_loss:.4f}_s-loss_{avg_style_loss:.4f}_'
            f'n-ppl_{avg_normal_ppl:5.4f}_s-ppl_{avg_style_ppl:5.4f}_epoch_{epoch}.pt')
class CustomAgent:

    def __init__(self) -> None:
        self.model_id = time.strftime("%Y_%m_%d-%H_%M_%S-") + str(uuid.uuid4())[:8]
        self.update_frequency = 10
        self.log_frequency = 1000
        self.gamma = 0.9
        self.use_cuda = False
        self.device = 'cpu'

        # load config & vocab
        with open("./vocab.txt") as f:
            self.word_vocab = f.read().split("\n")
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        self.max_vocab_size = len(self.word_vocab)
        self.word2id = {}
        for i, w in enumerate(self.word_vocab):
            self.word2id[w] = i
        self.EOS_id = self.word2id["</S>"]

        # set the random seed manually for reproducibility
        np.random.seed(self.config['general']['random_seed'])
        torch.manual_seed(self.config['general']['random_seed'])
        if torch.cuda.is_available():
            if not self.config['general']['use_cuda']:
                print("WARNING: CUDA device detected but 'use_cuda: false' found in config.yaml")
                self.use_cuda = False
            else:
                torch.backends.cudnn.deterministic = True
                torch.cuda.manual_seed(self.config['general']['random_seed'])
                self.use_cuda = True
                self.device = 'cuda:0'
        else:
            self.use_cuda = False

        self.batch_size = self.config['training']['batch_size']
        self.max_nb_steps_per_episode = self.config['training']['max_nb_steps_per_episode']
        self.nb_epochs = self.config['training']['nb_epochs']
        self.experiment_tag = self.config['checkpoint']['experiment_tag']
        self.model_checkpoint_path = self.config['checkpoint']['model_checkpoint_path']
        self.save_frequency = self.config['checkpoint']['save_frequency']
        self.update_per_k_game_steps = self.config['general']['update_per_k_game_steps']  # update_frequency ?
        self.clip_grad_norm = self.config['training']['optimizer']['clip_grad_norm']

        self._initialized = False
        self._epsiode_has_started = False
        self.current_episode = 0
        self.best_avg_score_so_far = 0.0

        # model init
        self.model = CommandScorerModel(input_size=self.max_vocab_size,
                                        hidden_size=128,
                                        device=self.device,
                                        verbose=False)
        parameters = filter(lambda p: p.requires_grad, self.model.parameters())
        self.optimizer = torch.optim.Adam(
            parameters, lr=self.config['training']['optimizer']['learning_rate'])
        self.model.to(self.device)

        # optionally resume from a checkpoint
        if self.config['checkpoint']['load_pretrained']:
            self.load_pretrained_model(
                self.model_checkpoint_path + '/' +
                self.config['checkpoint']['pretrained_experiment_tag'] + '.pt')
        if self.use_cuda:
            self.model.cuda()

        # tokenizer load
        self.nlp = spacy.load('en', disable=['ner', 'parser', 'tagger'])
        self.preposition_map = {"take": "from", "chop": "with", "slice": "with",
                                "dice": "with", "cook": "with",
                                "insert": "into", "put": "on"}
        self.single_word_verbs = set(["inventory", "look"])
        self.mode = "test"  # TODO
        self.rng = RandomState()
        self.text_processor = TextPreprocessor(self.nlp, self.device,
                                               self.word_vocab,
                                               self.single_word_verbs,
                                               self.EOS_id,
                                               self.preposition_map,
                                               self.word2id)

    def infos_to_request(self) -> EnvInfos:
        request_infos = EnvInfos()
        request_infos.description = True
        request_infos.inventory = True
        request_infos.entities = True
        request_infos.verbs = True
        request_infos.extras = ["recipe"]
        return request_infos

    def tokenize(self, text):
        text = preprocessing(text, tokenizer=self.nlp)
        word_ids = [get_word_id(t, self.word2id, self.max_vocab_size) for t in text]
        return word_ids

    def discount_rewards(self, last_values):
        returns, advantages = [], []
        R = last_values.data
        for t in reversed(range(len(self.transitions))):
            rewards, _, _, values = self.transitions[t]
            R = rewards + self.gamma * R
            adv = R - values
            returns.append(R)
            advantages.append(adv)
        return returns[::-1], advantages[::-1]
    def select_additional_infos(self) -> EnvInfos:
        return EnvInfos(description=True, inventory=True,
                        admissible_commands=True, has_won=True,
                        extras=["recipe"], has_lost=True)

    def load_pretrained_model(self, load_from):
        print("loading model from %s\n" % load_from)
        try:
            if self.use_cuda:
                state_dict = torch.load(load_from)
            else:
                state_dict = torch.load(load_from, map_location='cpu')
            self.model.load_state_dict(state_dict)
        except Exception:
            print("Failed to load checkpoint...")

    def finish(self) -> None:
        """
        All games in the batch are finished. One can choose to save
        checkpoints, evaluate on a validation set, or do parameter
        annealing here.
        """
        # the game has finished (either won, lost, or exhausted all the given steps)
        self.final_rewards = np.array(self.scores[-1], dtype='float32')  # batch
        dones = []
        for d in self.dones:
            dones.append(np.array([float(dd) for dd in d], dtype='float32'))
        dones = np.array(dones)
        step_used = 1.0 - dones
        self.step_used_before_done = np.sum(step_used, 0)  # batch

        # save checkpoint
        if self.mode == "train" and self.current_episode % self.save_frequency == 0:
            avg_score = np.mean(self.final_rewards)
            if avg_score > self.best_avg_score_so_far:
                self.best_avg_score_so_far = avg_score
                save_to = (self.model_checkpoint_path + '/' + self.experiment_tag +
                           "_episode_" + str(self.current_episode) + ".pt")
                if not os.path.isdir(self.model_checkpoint_path):
                    os.mkdir(self.model_checkpoint_path)
                torch.save(self.model.state_dict(), save_to)
                print("========= saved checkpoint =========")
        self.current_episode += 1

    def train(self):
        self.mode = "train"
        self.stats = {"max": defaultdict(list), "mean": defaultdict(list)}
        self.transitions = []
        self.model.reset_hidden(1)
        self.last_score = 0
        self.no_train_step = 0
        self.dones = []
        self.scores = []

    def eval(self):
        self.mode = "test"
        self.model.reset_hidden(1)

    def act(self, obs: List[str], scores: List[int], dones: List[bool],
            infos: Dict[str, List[Any]]) -> Optional[List[str]]:
        input_tensor, _, commands_tensor = self.text_processor.get_game_step_info(obs, infos)
        outputs, indexes, values = self.model(input_tensor, commands_tensor)
        print('outputs:', outputs)
        print('indexes:', indexes[0])
        print('values:', values)

        actions_per_batch = []
        for cmds_i in range(self.batch_size):
            try:
                action = infos["admissible_commands"][cmds_i][indexes[0][cmds_i]]
            except IndexError:
                # TODO torch.Size([3, max_seq_len, max_commands_number])
                # the model picked a padded slot; fall back to a random command
                action = self.rng.choice(infos["admissible_commands"][cmds_i])
                warnings.warn("Warning model choice padded array: %s" % (
                    str(infos["admissible_commands"][cmds_i]) + ' ' +
                    str(len(infos["admissible_commands"][cmds_i])) + ' ' +
                    str(indexes[0][cmds_i]),))
            actions_per_batch.append(action)
        print('*' * 100)

        if self.mode == "eval":
            if all(dones):
                self.model.reset_hidden(1)
            return actions_per_batch

        self.no_train_step += 1
        if self.transitions:
            # reward is the per-game gain/loss in score since the last step,
            # with a +/-100 bonus for winning or losing
            rewards = np.array(scores, dtype='float32') - self.last_score
            self.last_score = np.array(scores, dtype='float32')
            rewards += 100.0 * np.array(infos["has_won"], dtype='float32')
            rewards -= 100.0 * np.array(infos["has_lost"], dtype='float32')
            self.transitions[-1][0] = rewards  # update reward information

        if not self._epsiode_has_started:
            self.start_episode(obs, infos)

        if self.no_train_step > 1:
            # append scores / dones from the previous step into memory
            # (nothing to record on the very first step)
            self.scores.append(scores)
            self.dones.append(dones)
        if all(dones):
            self.end_episode(obs, scores, infos)
            return  # nothing to return
        return [self.rng.choice(cmds) for cmds in infos["admissible_commands"]]
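
# `get_word_id` (used in `tokenize` above) is defined elsewhere; a minimal
# sketch of the usual behavior -- look the token up in the vocabulary and fall
# back to a deterministic in-range bucket for out-of-vocabulary tokens. This is
# an assumption, not necessarily the original helper:
def get_word_id(token, word2id, max_vocab_size):
    if token in word2id:
        return word2id[token]
    return hash(token) % max_vocab_size
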
words = list(set(data["word"].values))
n_words = len(words)
print('Number of words:', n_words)

# calculate tags
tags = list(set(data["tag"].values))
n_tags = len(tags)
print('Number of tags:', n_tags)
print('Tags:', tags)

# create output folders for X and y
gfile.MakeDirs(os.path.dirname(args.output_x_path))
gfile.MakeDirs(os.path.dirname(args.output_y_path))

# preprocess text
processor = TextPreprocessor(140)
processor.fit(sentences_list)
processor.labels = list(set(data["tag"].values))
X = processor.transform(sentences_list)

# preprocess tags
tag2idx = {t: i for i, t in enumerate(tags)}
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=140, sequences=y, padding="post", value=tag2idx["O"])
y = [to_categorical(i, num_classes=n_tags) for i in y]

# export features for training (pickle needs a binary stream)
with gfile.GFile(args.output_x_path, 'wb') as output_X:
    pickle.dump(X, output_X)
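
# The snippet ends after exporting X; presumably the labels are exported the
# same way (an assumption, mirroring the X export above):
with gfile.GFile(args.output_y_path, 'wb') as output_y:
    pickle.dump(y, output_y)
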
from collections import Counter

import numpy as np


def build_vocabulary(sequences, max_words=None):
    # flatten the token sequences, then keep the `max_words` most common words
    words = []
    for token_sequence in sequences:
        words.extend(token_sequence)
    word_counts = dict(Counter(words).most_common(max_words))
    most_common_words = list(word_counts.keys())
    word_ids = list(range(len(most_common_words)))
    vocabulary = dict(zip(most_common_words, word_ids))
    return vocabulary


sentences = np.genfromtxt('./tickets_QIT.txt', dtype=str, delimiter='\n')

# build the preprocessing decorator chain: clean, tokenize, encode, pair up
prep = TextPreprocessor(sentences)
prep = QITEmailBodyCleaner(prep)
prep = Tokenizer(prep, language='italian')
tokens = prep.preprocess()

vocabulary = build_vocabulary(tokens)
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
prep = WordContextPairsGenerator(prep, window_length=2)
word_context_pairs = prep.preprocess()

target_words = [tw for (tw, cw) in word_context_pairs]
context_words = [cw for (tw, cw) in word_context_pairs]
np.savetxt('target_words.txt', target_words, fmt='%d')
np.savetxt('context_words.txt', context_words, fmt='%d')
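
# Usage sketch: the two files hold aligned (target, context) id pairs -- i.e.
# skip-gram style training data -- and can be reloaded as integer arrays:
targets = np.loadtxt('target_words.txt', dtype=int)
contexts = np.loadtxt('context_words.txt', dtype=int)
assert targets.shape == contexts.shape
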
from ast import literal_eval
from collections import Counter

import numpy as np

from text_preprocessor import TextPreprocessor
from qit_cleaner import QITEmailBodyCleaner
from integer_encoder import IntegerEncoder
from tokenizer import Tokenizer
from padder import Padder

sentences = np.genfromtxt('../upsampled/x_QIT.txt', delimiter='\n', dtype=str)
language = 'italian'
max_words = None
max_length = 25

# Text preprocessor with no functionality of its own
prep = TextPreprocessor(sentences)
# Add decorator to clean email bodies
prep = QITEmailBodyCleaner(prep)
# Add tokenizer decorator
prep = Tokenizer(prep, language)

# Load the vocabulary, stored as a Python dict literal
# (literal_eval is a safer drop-in for eval here)
with open('vocabulary_wikipedia', 'r') as vocabulary_file:
    vocabulary = literal_eval(vocabulary_file.read())

# Add integer encoding decorator
unknown_token_id = max(vocabulary.values()) + 1
prep = IntegerEncoder(prep, vocabulary, unknown_token_id)
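
# The snippet ends here; the imported `Padder` and `max_length = 25` suggest
# the next decorator pads/truncates each encoded sequence. A sketch under that
# assumption (the Padder signature is a guess):
prep = Padder(prep, max_length)
padded_sequences = prep.preprocess()
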