def process_input_text(file_text, id_name): global KEY (meta, main) = preprocess.split_text(file_text) if not meta: print "ERROR IN SPLITTING MAIN AND META" return if not main: print "ERROR IN SPLITTING MAIN AND META" return file_text = re.sub(NEWLINE, " ", main) if DEBUG: print ("processing text", main) print ("") d = answr_dict() if not KEY: make_key() grammar = r""" NP: {<RB|PP\$>?<JJ>*<NN>+<POS>?} NP: {<RB|PP\$>?<JJ>*<NN>+<NNS>*} {<NNP>+} {<RB|PP\$>?<JJ>*<NNS>*<POS>?} """ # sents = map(pos_tag, map(word_tokenize, [s for s in sent_tokenize(file_text.lower())])) # cp = RegexpParser(grammar) # for s in sents: # print cp.parse(s) weapons = get_weapon(file_text, d) print weapons weapon = weapons[0][0] print id_name print "C", KEY[id_name], "\n", "D", weapon print # perpindiv = get_perp_indiv(file_text, d) perpindiv = "-" # perporg = get_perp_org(file_text, d) perporg = "-" # targets = get_target(file_text, d) # target = targets[0][0] target = "-" # victims = get_victim(file_text, d) # victim = victims[0][0] victim = "-" incident_type = incident_predictor.get_predicted_event(main) print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
def process_input_text(file_text, id_name):
    """Pattern-based extraction of victim, target and perpetrator slots.

    file_text -- raw document text (meta header + main body)
    id_name   -- document identifier, echoed in the debug output and passed
                 through to print_outf()

    Pipeline: split meta/main, NER-tag the text, load the regex pattern
    files, scan every sentence for victim/target/perp candidates, then run
    a cascade of cleanup passes over each candidate set before emitting the
    final answer via print_outf().  Prints an error and returns None when
    the meta/main split fails.
    """
    # Split the document; both the meta header and the main body must exist.
    (meta, main) = preprocess.split_text(file_text)
    if (not meta):
        print "ERROR IN SPLITTING MAIN AND META"
        return
    if(not main):
        print "ERROR IN SPLITTING MAIN AND META"
        return
    #print proc_meta(meta)
    # Working lists (per sentence) and the de-duplicating result sets.
    temp_victim_list = []
    final_victim_set = set([])
    temp_target_list = []
    final_target_set = set([])
    temp_perpi_list = []
    final_perpi_set = set([])
    # Remove the \n from in between the lines of the main body.
    file_text = re.sub(NEWLINE, " ", main)
    file_text_list = file_text.split('\n')
    if(DEBUG):
        print ("processing text", main)
        print ("")
    # Incident type is predicted from the main body
    # (pass file_text instead of main in infoextract2.py).
    incident_type = incident_predictor.get_predicted_event(main)
    # Run the external (Java) NER tagger and pull back an entity dict.
    ner_tagged_text = process_ner.java_ner_tagger(file_text)
    if (ner_tagged_text):
        # NOTE(review): str.strip() returns a new string; this result is
        # discarded, so the call has no effect — presumably meant to rebind.
        ner_tagged_text.strip()
    if(ner_tagged_text):
        ner_dict = process_ner.get_entities()
    # NOTE(review): ner_dict is only bound when the tagger returned text;
    # this check raises NameError otherwise — confirm tagger never fails.
    if(ner_dict):
        print ner_dict
    # Load the extraction patterns, one regex per line.
    text = utility.f_read('victim_out_patterns_regex2')
    victim_patt_lines = text.split('\n')
    text = utility.f_read('target_out_patterns_regex2')  # has only back patt
    target_patt_lines = text.split('\n')
    text = utility.f_read('perp_out_patterns_regex2')  # has both front and back patterns
    perp_patt_lines = text.split('\n')
    # ALGO: read one line at a time; if it matches one of the patterns,
    # parse that line and collect the matched noun phrases.
    for line in file_text_list:
        line = line.strip()
        if(not line):
            continue
        # Split each input line into individual sentences.
        sents = utility.sent_splitter(line)
        for sent in sents:
            #print "processing line",line
            # Make sure there are no consecutive white spaces in the sentence.
            sent = sent.strip()
            # TODO remove 's and `` from sentence — remove `` as well?
            sent = re.sub(SPATT, "", sent)
            input_line = re.sub(COLL_SPACES, SPACES_REPL, sent)
            # VICTIM LIST: collect non-empty, stripped candidates.
            temp_victim_list = pattern_extractor.get_victims(input_line, victim_patt_lines)
            if temp_victim_list:
                for victim in temp_victim_list:
                    victim = victim.strip()
                    if victim:
                        final_victim_set.add(victim)
            # TARGET LIST
            temp_target_list = pattern_extractor.get_targets(input_line, target_patt_lines)
            if temp_target_list:
                for target in temp_target_list:
                    target = target.strip()
                    if target:
                        final_target_set.add(target)
            # PERPI LIST
            temp_perpi_list = pattern_extractor.get_perpi(input_line, perp_patt_lines)
            if temp_perpi_list:
                for perp in temp_perpi_list:
                    perp = perp.strip()
                    if perp:
                        final_perpi_set.add(perp)
    # Now use cleanup algorithms to remove redundant entries from each set.
    # A victim cannot be an org or location ?? has to be a person.
    # Subset removal first, then synonym/duplicate removal, flag-word
    # filtering, title stripping, and finally one-word/digit cleanup.
    v_new_list = list(final_victim_set)
    v_new_list = utility.remove_subsets(v_new_list)
    print "after subset removal"
    print v_new_list
    v_new_list = utility.remove_syn(v_new_list)
    print "after duplicate removal for ", id_name
    print v_new_list
    v_new_list = utility.rmv_flagged_np(v_new_list, 'victim')  # e.g headquarters
    print "after removing flag words for ", id_name
    print v_new_list
    v_new_list = utility.first_word_flag(v_new_list, 'victim')  # e.g suspects
    print "after one removing first word flags for ", id_name
    print v_new_list
    v_new_list = utility.first_word_rmv(v_new_list)  # e.g COLONEL REPORTER
    print "after removing first title words like COLONEL etc ", id_name
    print v_new_list
    v_new_list = utility.one_word_cleaner(v_new_list)
    print "after one word and digit removal for ", id_name
    print v_new_list
    v_new_list = utility.victim_hacks(v_new_list)  # e.g hacks
    print "after adding some hacks make unique", id_name
    print v_new_list
    print "###########################"
    # A target cannot be a person or location: same cascade, target flags.
    t_new_list = list(final_target_set)
    t_new_list = utility.remove_subsets(t_new_list)
    print "after subset removal"
    print t_new_list
    t_new_list = utility.remove_syn(t_new_list)
    print "after duplicate removal"
    print t_new_list
    t_new_list = utility.rmv_flagged_np(t_new_list, 'target')  # e.g headquarters
    print "after removing flag words for ", id_name
    print t_new_list
    t_new_list = utility.first_word_flag(t_new_list, 'target')  # e.g suspects
    print "after one removing first word flags for ", id_name
    print t_new_list
    t_new_list = utility.one_word_cleaner(t_new_list)
    print "###Final after one word removal for ", id_name
    print t_new_list
    #print "###########################"
    # NER HINT: a perpetrator cannot be a LOCATION or an org ??
    p_new_list = list(final_perpi_set)
    p_new_list = utility.remove_subsets(p_new_list)
    print "after subset removal"
    print p_new_list
    p_new_list = utility.remove_syn(p_new_list)
    print "after duplicate removal"
    print p_new_list
    p_new_list = utility.rmv_flagged_np(p_new_list, 'perp')  # e.g headquarters
    print "after removing flag words for ", id_name
    print p_new_list
    p_new_list = utility.first_word_flag(p_new_list, 'perp')  # e.g suspects
    print "after one removing first word flags for ", id_name
    print p_new_list
    p_new_list = utility.one_word_cleaner(p_new_list)
    print " Final after one word and digit removal for ", id_name
    print p_new_list
    #print "###########################"
    #dict_out = matching.match(parsed_text)
    #print ("")
    # Emit the final answer: empty lists for the unextracted slots.
    print_outf(id_name, incident_type, [], p_new_list, [], t_new_list, v_new_list)
def main():
    """Train a ConvRNNLSTMFeat duplicate-question classifier and dump predictions.

    End-to-end script: parse CLI flags, load/clean the Quora-style question
    pairs plus precomputed features, pack everything into 4-D tensors, train
    with early best-model checkpointing on validation log-loss, then reload
    the best model and write validation (and optionally test) probability
    pickles to ../predictions/.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30,
                        help='length of LSTM')
    parser.add_argument('--demb', type=int, default=300,
                        help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=300,
                        help='number of hidden units per layer')
    parser.add_argument('--dlin', type=int, default=500,
                        help='number linear transformation nodes')
    parser.add_argument('--dout', type=int, default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--wd', type=float, default=0.0,
                        help='adam l2 weight decay')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--rnn', type=str, default='lstm',
                        help='lstm or gru')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=2000, metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3,
                        help='random seed')
    # NOTE(review): help text below is a copy-paste of --seed's; this flag is
    # actually the vocabulary size cap.
    parser.add_argument('--vocabsize', type=int, default=200000,
                        help='random seed')
    parser.add_argument('--optimizer', action='store_true',
                        help='use ADAM optimizer')
    parser.add_argument('--reweight', action='store_true',
                        help='reweight loss function')
    parser.add_argument('--clean', action='store_true',
                        help='clean text')
    parser.add_argument('--rm_stops', action='store_true',
                        help='remove stop words')
    # NOTE(review): store_false means these are True by default and the flag
    # turns them OFF — confirm that is intended for --bidir / --freezeemb.
    parser.add_argument('--bidir', action='store_false',
                        help='bidirectional')
    parser.add_argument('--freezeemb', action='store_false',
                        help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true',
                        help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str, default='',
                        help='path to save the final model')
    args = parser.parse_args()

    # ---- Load train/validation CSVs; blank out NaN questions. ----
    pipe = None
    corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
    train_data = pd.read_csv('../data/train_data_shuffle.csv')
    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    train_data = train_data.fillna(' ')
    valid_data = valid_data.fillna(' ')
    if args.reweight:
        print('Downsampling')
        # Downsample positives in the validation set so the positive rate
        # matches the target prior p = 0.19.
        pos_valid = valid_data[valid_data['is_duplicate'] == 1]
        neg_valid = valid_data[valid_data['is_duplicate'] == 0]
        p = 0.19
        pl = len(pos_valid)
        tl = len(pos_valid) + len(neg_valid)
        # Keep `val` positives so that val / (val + len(neg_valid)) == p.
        val = int(pl - (pl - p * tl) / ((1 - p)))
        pos_valid = pos_valid.iloc[:int(val)]
        valid_data = pd.concat([pos_valid, neg_valid])
    print('Splitting Train')
    q1 = list(train_data['question1'].map(str))
    q2 = list(train_data['question2'].map(str))
    y = list(train_data['is_duplicate'])
    print('Splitting Valid')
    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])
    # Precomputed per-pair features; rows are selected by the 'id' column.
    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values
    train_feat = train_feat.iloc[train_data['id']].values
    print('Splitting Data')
    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            # NOTE(review): harmless double assignment (`stops = stops =`).
            stops = stops = set(stopwords.words("english"))
        q1 = [split_text(x, stops) for x in q1]
        q2 = [split_text(x, stops) for x in q2]
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        # No cleaning: lowercase + whitespace tokenization.
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]
    print('Downsample Weight: ', np.mean(y_val))
    corpus.gen_vocab(q1 + q2 + q2_val + q1_val)
    # Pack each example as a (1, 3, feat_max) slab: row 0 = q1 token ids,
    # row 1 = q2 token ids, row 2 = dense features (zero-padded to feat_max).
    n_feat = train_feat.shape[1]
    d_in = args.din
    feat_max = int(np.max([n_feat, d_in]))
    X = torch.Tensor(len(train_data), 1, 3, feat_max)
    X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
    X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
    X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
    y = torch.from_numpy(np.array(y)).long()
    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
    X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()
    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()
    print('Generating Data Loaders')
    # X.size: len(train_data), 1, 3, feat_max
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset, batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize, shuffle=False)
    num_train = len(X)
    # Free the raw tensors/lists; the loaders hold their own references.
    del X, y, X_val, y_val, train_feat, val_feat, q1, q2, q1_val, q2_val
    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        # GloVe ships in these dimensionalities only.
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)
    model = ConvRNNLSTMFeat(args.din, args.dhid, args.dout, args.demb,
                            args.dlin, args.vocabsize, args.dropout,
                            args.embinit, args.hidinit, args.decinit,
                            glove_embeddings, args.cuda, args.rnn,
                            args.bidir, n_feat)
    if args.cuda:
        model.cuda()
    if args.reweight:
        # Class weights compensating for the downsampled label distribution.
        w_tensor = torch.Tensor([1.309028344, 0.472001959])
        if args.cuda:
            w_tensor = w_tensor.cuda()
        criterion = nn.NLLLoss(weight=w_tensor)
    else:
        criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.wd)
    # One tab-separated line summarizing the run configuration.
    model_config = '\t'.join([
        str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din,
                         args.demb, args.dhid, args.embinit, args.decinit,
                         args.hidinit, args.dropout, args.optimizer,
                         args.reweight, args.lr, args.vocabsize,
                         args.batchsize, args.clean, args.rm_stops)
    ])
    print(
        'Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | Reweight | LR | VocabSize | batchsize | Clean | Stops'
    )
    print(model_config)
    # best_val_acc = 0.78
    # Checkpoint whenever validation log-loss drops below the running best.
    best_ll = 0.3
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            # Slice the packed slab back into (q1 ids, q2 ids, features).
            pred = model(qs[:, 0, 0, :d_in].long(),
                         qs[:, 0, 1, :d_in].long(),
                         qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)
            # NOTE(review): `optimizer` is the Adam object and is always
            # truthy, so the manual SGD branch below is dead code — the
            # intent was probably `if args.optimizer:`.
            if optimizer:
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)
            # loss.data[0] is the pre-0.4 PyTorch scalar-access idiom.
            total_cost += loss.data[0]
            cur_loss += loss.data[0]
            if ind % args.loginterval == 0 and ind > 0:
                # NOTE(review): this overwrites the accumulated cur_loss with
                # only the LAST batch's loss / loginterval — the running
                # average intent was probably `cur_loss / args.loginterval`.
                cur_loss = loss.data[0] / args.loginterval
                elapsed = time.time() - start_time
                print(
                    '| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                    'Loss {:.6f}'.format(epoch, ind,
                                         num_train // args.batchsize,
                                         elapsed * 1000.0 / args.loginterval,
                                         cur_loss))
                start_time = time.time()
                cur_loss = 0
        # ---- End-of-epoch evaluation on both splits. ----
        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in,
                                       n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in,
                                   n_feat)
        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            # Persist corpus + full model + state dict + config on improvement.
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            # model.cpu() moves the model off the GPU for saving; it is moved
            # back below only when args.cuda is set.
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()
        print(
            'Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'
            .format(epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
    del train_loader
    print('Reloading Best Model')
    # NOTE(review): crashes if args.save is '' or no checkpoint was ever
    # written; model.cuda() here is unconditional even without --cuda.
    model = torch.load(args.save)
    model.cuda()
    model.eval()
    # ---- Rebuild the (un-truncated) validation loader for prediction. ----
    print('RELOADING VALID')
    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    valid_data = valid_data.fillna(' ')
    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])
    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values
    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            # NOTE(review): same harmless double assignment as above.
            stops = stops = set(stopwords.words("english"))
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]
    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
    X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()
    if args.cuda:
        X_val, y_val = X_val.cuda(), y_val.cuda()
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize, shuffle=False)
    del X_val, y_val, train_feat, val_feat, q1_val, q2_val, valid_data
    print('PREDICTING VALID')
    # Collect P(duplicate) — model outputs log-probs, hence .exp().
    pred_list = []
    for ind, (qs, _) in enumerate(valid_loader):
        out = model(qs[:, 0, 0, :d_in].long(),
                    qs[:, 0, 1, :d_in].long(),
                    qs[:, 0, 2, :n_feat])
        pred_list += list(out.exp()[:, 1].data.cpu().numpy())
    with open('../predictions/' + args.save + '_val.pkl', 'wb') as f:
        pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)
    # Test-set prediction only runs in the reweighted configuration.
    if args.reweight:
        print('LOADING TEST DATA')
        test_data = pd.read_csv('../data/test.csv')
        test_data = test_data.fillna(' ')
        q1 = list(test_data['question1'].map(str))
        q2 = list(test_data['question2'].map(str))
        # NOTE(review): test text is never run through split_text even when
        # --clean was used for training — confirm this asymmetry is intended.
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]
        print('LOADING TEST FEATURES')
        test_feat = pd.read_csv('../data/test_features_all_norm.csv').values
        n_feat = test_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))
        X = torch.Tensor(len(test_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(test_feat))
        # Dummy labels: TensorDataset requires a target tensor.
        y = torch.LongTensor(len(test_data)).zero_()
        if args.cuda:
            X = X.cuda()
            y = y.cuda()
        test_loader = DataLoader(TensorDataset(X, y), batch_size=500,
                                 shuffle=False)
        print('PREDICTING')
        pred_list = []
        for ind, (qs, _) in enumerate(test_loader):
            out = model(qs[:, 0, 0, :d_in].long(),
                        qs[:, 0, 1, :d_in].long(),
                        qs[:, 0, 2, :n_feat])
            pred_list += list(out.exp()[:, 1].data.cpu().numpy())
        with open('../predictions/' + args.save + '.pkl', 'wb') as f:
            pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)