Code example #1
File: infoextract.py  Project: hausdorf/terr
def process_input_text(file_text, id_name):
    global KEY

    (meta, main) = preprocess.split_text(file_text)
    if not meta or not main:
        print "ERROR IN SPLITTING MAIN AND META"
        return

    file_text = re.sub(NEWLINE, " ", main)
    if DEBUG:
        print ("processing text", main)
        print ("")

    d = answr_dict()

    if not KEY:
        make_key()

    grammar = r"""
	NP: {<RB|PP\$>?<JJ>*<NN>+<POS>?}
	NP: {<RB|PP\$>?<JJ>*<NN>+<NNS>*}
	    {<NNP>+}
	    {<RB|PP\$>?<JJ>*<NNS>*<POS>?}
	"""
    # sents = map(pos_tag, map(word_tokenize, [s for s in sent_tokenize(file_text.lower())]))
    # cp = RegexpParser(grammar)
    # for s in sents:
    # 	print cp.parse(s)
    # predict the weapon slot (first candidate) and print it next to the
    # answer-key entry for this document
    weapons = get_weapon(file_text, d)
    print weapons
    weapon = weapons[0][0]
    print id_name
    print "C", KEY[id_name], "\n", "D", weapon
    print
    # perpindiv = get_perp_indiv(file_text, d)
    perpindiv = "-"
    # perporg = get_perp_org(file_text, d)
    perporg = "-"
    # targets = get_target(file_text, d)
    # target = targets[0][0]
    target = "-"
    # victims = get_victim(file_text, d)
    # victim = victims[0][0]
    victim = "-"

    incident_type = incident_predictor.get_predicted_event(main)
    print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
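A minimal driver sketch for this example (not part of the original project): it assumes the function lives in infoextract.py next to the helpers it relies on (preprocess, incident_predictor, KEY, make_key, get_weapon, print_out), and the command-line handling shown here is a hypothetical addition.

import sys

if __name__ == "__main__":
    # hypothetical entry point: one raw document per invocation,
    # using the file name as the document id
    path = sys.argv[1]
    doc_id = path.split("/")[-1]
    with open(path) as fh:
        process_input_text(fh.read(), doc_id)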
Code example #2
File: infoextract2_ner.py  Project: hausdorf/terr
def process_input_text(file_text,id_name):
	# remove the \n from in between the lines
	(meta,main) = preprocess.split_text(file_text)
	if not meta or not main:
		print "ERROR IN SPLITTING MAIN AND META"
		return
	#print proc_meta(meta)
		
	temp_victim_list = []
	final_victim_set =set([])
	temp_target_list = []
	final_target_set = set([])
	temp_perpi_list = []
	final_perpi_set = set([])

	file_text = re.sub(NEWLINE," ",main)
	file_text_list = file_text.split('\n')
	if(DEBUG):
		print ("processing text",main) 
		print ("")
	
	# pass file text instead of main in infoextract2.py 	
	incident_type = incident_predictor.get_predicted_event(main) 
	# run the Java NER tagger and build the entity dictionary
	ner_dict = None
	ner_tagged_text = process_ner.java_ner_tagger(file_text)
	if ner_tagged_text:
		ner_tagged_text = ner_tagged_text.strip()
		if ner_tagged_text:
			ner_dict = process_ner.get_entities()

	if ner_dict:
		print ner_dict
	# load the victim / target / perpetrator extraction patterns
	text = utility.f_read('victim_out_patterns_regex2')
	victim_patt_lines = text.split('\n')
	text = utility.f_read('target_out_patterns_regex2') # has only back patterns
	target_patt_lines = text.split('\n')
	text = utility.f_read('perp_out_patterns_regex2') # has both front and back patterns
	perp_patt_lines = text.split('\n')
	# ALGO: read one line at a time; if it matches one of the patterns, parse that line and extract the slot fillers


	# read each line from the input file
	for line in file_text_list:
		line = line.strip()
		if(not line):
			continue

		# split each line into several sentences
		sents = utility.sent_splitter(line)
		for sent in sents:
			#print "processing line",line	
			# make sure there are no consecutive whitespaces in the line
			sent  = sent.strip()
			# TODO: remove 's and `` from the sentence as well?
			sent = re.sub(SPATT,"",sent)			
			input_line = re.sub(COLL_SPACES,SPACES_REPL,sent)
			temp_victim_list = pattern_extractor.get_victims(input_line,victim_patt_lines)
			if temp_victim_list:
				for victim in temp_victim_list:
					victim  = victim.strip()
					if victim:
						final_victim_set.add(victim)
			# TARGET LIST
			temp_target_list = pattern_extractor.get_targets(input_line,target_patt_lines)
			if temp_target_list:
				for target in temp_target_list:
					target = target.strip()
					if target:
						final_target_set.add(target)
			# PERPI LIST
			temp_perpi_list = pattern_extractor.get_perpi(input_line,perp_patt_lines)
			if temp_perpi_list:
				for perp in temp_perpi_list:
					perp = perp.strip()
					if perp:
						final_perpi_set.add(perp)


			# now clean these lists and remove redundant entries
			# get target_list
	# a victim cannot be an org or a location; it has to be a person

	#subset removal
	v_new_list = list(final_victim_set)
	v_new_list  = utility.remove_subsets(v_new_list)	
	print "after subset removal"
	print v_new_list
	v_new_list = utility.remove_syn(v_new_list)
	print "after duplicate removal for ",id_name
	print v_new_list

	v_new_list = utility.rmv_flagged_np(v_new_list,'victim')# e.g headquarters
	print "after removing flag words   for ",id_name
	print v_new_list

	v_new_list = utility.first_word_flag(v_new_list,'victim')# e.g. suspects
	print "after removing first-word flags for ",id_name
	print v_new_list

	v_new_list = utility.first_word_rmv(v_new_list)# e.g COLONEL REPORTER
	print "after removing first title words like COLONEL etc ",id_name
	print v_new_list

	v_new_list = utility.one_word_cleaner(v_new_list)
	print "after one word and digit removal for ",id_name
	print v_new_list
	v_new_list = utility.victim_hacks(v_new_list)# victim-specific hacks
	print "after applying victim hacks for ",id_name
	print v_new_list
	print "###########################"

	# a target cannot be a person or a location

	t_new_list  = list(final_target_set)
	t_new_list  = utility.remove_subsets(t_new_list)	
	print "after subset removal"
	print t_new_list
	t_new_list = utility.remove_syn(t_new_list)
	print "after duplicate removal"
	print t_new_list


	t_new_list = utility.rmv_flagged_np(t_new_list,'target')# e.g headquarters
	print "after removing flag words   for ",id_name
	print t_new_list
	t_new_list = utility.first_word_flag(t_new_list,'target')# e.g. suspects
	print "after removing first-word flags for ",id_name
	print t_new_list

	t_new_list = utility.one_word_cleaner(t_new_list)
	print "###Final after one word removal for ",id_name
	print t_new_list
	#print "###########################"


	# NER hint: a perpetrator probably cannot be a LOCATION or an org

	p_new_list  = list(final_perpi_set)
	p_new_list  = utility.remove_subsets(p_new_list)	
	print "after subset removal"
	print p_new_list
	p_new_list = utility.remove_syn(p_new_list)
	print "after duplicate removal"
	print p_new_list

	p_new_list = utility.rmv_flagged_np(p_new_list,'perp')# e.g headquarters
	print "after removing flag words   for ",id_name
	print p_new_list
	p_new_list = utility.first_word_flag(p_new_list,'perp')# e.g. suspects
	print "after removing first-word flags for ",id_name
	print p_new_list

	p_new_list = utility.one_word_cleaner(p_new_list)
	print " Final after one word and digit removal for ",id_name
	print p_new_list
	#print "###########################"


	#dict_out    = matching.match(parsed_text)
	#print ("")
	print_outf(id_name,incident_type,[],p_new_list,[],t_new_list,v_new_list)
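A hypothetical batch driver for the NER variant, shown only as a sketch: it assumes the pattern files ('victim_out_patterns_regex2', 'target_out_patterns_regex2', 'perp_out_patterns_regex2') and the Java NER tagger are reachable from the working directory, and the input directory name below is made up for illustration.

import os

if __name__ == "__main__":
    doc_dir = "dev_docs"  # hypothetical directory of raw input documents
    for name in sorted(os.listdir(doc_dir)):
        with open(os.path.join(doc_dir, name)) as fh:
            process_input_text(fh.read(), name)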
Code example #3
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data',
                        type=str,
                        default='../data/',
                        help='location of the data corpus')
    parser.add_argument('--presaved',
                        action='store_true',
                        help='use presaved data')
    parser.add_argument('--glovedata',
                        type=str,
                        default='../data/',
                        help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30, help='length of LSTM')
    parser.add_argument('--demb',
                        type=int,
                        default=300,
                        help='size of word embeddings')
    parser.add_argument('--dhid',
                        type=int,
                        default=300,
                        help='number of hidden units per layer')
    parser.add_argument('--dlin',
                        type=int,
                        default=500,
                        help='number linear transformation nodes')
    parser.add_argument('--dout',
                        type=int,
                        default=2,
                        help='number of output classes')
    parser.add_argument('--nlayers',
                        type=int,
                        default=1,
                        help='number of layers')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='initial learning rate')
    parser.add_argument('--wd',
                        type=float,
                        default=0.0,
                        help='adam l2 weight decay')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--embinit',
                        type=str,
                        default='random',
                        help='embedding weight initialization type')
    parser.add_argument('--decinit',
                        type=str,
                        default='random',
                        help='decoder weight initialization type')
    parser.add_argument('--hidinit',
                        type=str,
                        default='random',
                        help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.0,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--rnn', type=str, default='lstm', help='lstm or gru')
    parser.add_argument('--epochs',
                        type=int,
                        default=40,
                        help='upper epoch limit')
    parser.add_argument('--batchsize',
                        type=int,
                        default=2000,
                        metavar='N',
                        help='batch size')
    parser.add_argument('--seed', type=int, default=3, help='random seed')
    parser.add_argument('--vocabsize',
                        type=int,
                        default=200000,
                        help='vocabulary size')
    parser.add_argument('--optimizer',
                        action='store_true',
                        help='use ADAM optimizer')

    parser.add_argument('--reweight',
                        action='store_true',
                        help='reweight loss function')
    parser.add_argument('--clean', action='store_true', help='clean text')
    parser.add_argument('--rm_stops',
                        action='store_true',
                        help='remove stop words')

    parser.add_argument('--bidir',
                        action='store_false',
                        help='disable the bidirectional RNN (on by default)')
    parser.add_argument('--freezeemb',
                        action='store_false',
                        help='do not freeze the embeddings (frozen by default)')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--loginterval',
                        type=int,
                        default=100,
                        metavar='N',
                        help='report interval')
    parser.add_argument('--save',
                        type=str,
                        default='',
                        help='path to save the final model')
    args = parser.parse_args()

    pipe = None
    corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)
    train_data = pd.read_csv('../data/train_data_shuffle.csv')
    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    train_data = train_data.fillna(' ')
    valid_data = valid_data.fillna(' ')

    if args.reweight:
        print('Downsampling')
        # downsample the positive validation pairs so that the positive rate
        # matches the assumed prior p = 0.19
        pos_valid = valid_data[valid_data['is_duplicate'] == 1]
        neg_valid = valid_data[valid_data['is_duplicate'] == 0]
        p = 0.19
        pl = len(pos_valid)
        tl = len(pos_valid) + len(neg_valid)
        val = int(pl - (pl - p * tl) / (1 - p))  # equals p * (tl - pl) / (1 - p)
        pos_valid = pos_valid.iloc[:int(val)]
        valid_data = pd.concat([pos_valid, neg_valid])

    print('Splitting Train')
    q1 = list(train_data['question1'].map(str))
    q2 = list(train_data['question2'].map(str))
    y = list(train_data['is_duplicate'])

    print('Splitting Valid')
    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])

    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values
    train_feat = train_feat.iloc[train_data['id']].values

    print('Splitting Data')
    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            stops = set(stopwords.words("english"))
        q1 = [split_text(x, stops) for x in q1]
        q2 = [split_text(x, stops) for x in q2]
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]

    print('Downsample Weight: ', np.mean(y_val))

    corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

    n_feat = train_feat.shape[1]
    d_in = args.din
    feat_max = int(np.max([n_feat, d_in]))

    X = torch.Tensor(len(train_data), 1, 3, feat_max)
    X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1,
                                                             feat_max)).long()
    X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2,
                                                             feat_max)).long()
    X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
    y = torch.from_numpy(np.array(y)).long()

    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0,
          0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val,
                                                           feat_max)).long()
    X_val[:, 0,
          1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val,
                                                           feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    #X.size len(train_data),1,2,fix_length
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    num_train = len(X)

    del X, y, X_val, y_val, train_feat, val_feat, q1, q2, q1_val, q2_val

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)

    model = ConvRNNLSTMFeat(args.din, args.dhid, args.dout, args.demb,
                            args.dlin, args.vocabsize, args.dropout,
                            args.embinit, args.hidinit, args.decinit,
                            glove_embeddings, args.cuda, args.rnn, args.bidir,
                            n_feat)

    if args.cuda:
        model.cuda()

    if args.reweight:
        # class weights that rebalance the NLL loss toward the assumed
        # test-set class distribution
        w_tensor = torch.Tensor([1.309028344, 0.472001959])
        if args.cuda:
            w_tensor = w_tensor.cuda()
        criterion = nn.NLLLoss(weight=w_tensor)
    else:
        criterion = nn.NLLLoss()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.wd)

    model_config = '\t'.join([
        str(x) for x in (torch.__version__, args.clip, args.nlayers, args.din,
                         args.demb, args.dhid, args.embinit, args.decinit,
                         args.hidinit, args.dropout, args.optimizer,
                         args.reweight, args.lr, args.vocabsize,
                         args.batchsize, args.clean, args.rm_stops)
    ])

    print(
        'Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer | Reweight | LR | VocabSize | batchsize | Clean | Stops'
    )
    print(model_config)

    # best_val_acc = 0.78
    best_ll = 0.3
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            pred = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(),
                         qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)

            # honor the --optimizer flag: Adam step when set, otherwise a
            # plain SGD update with the fixed learning rate
            if args.optimizer:
                optimizer.step()
            else:
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)

            total_cost += loss.data[0]
            cur_loss += loss.data[0]

            if ind % args.loginterval == 0 and ind > 0:
                cur_loss = cur_loss / args.loginterval
                elapsed = time.time() - start_time
                print(
                    '| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                    'Loss {:.6f}'.format(epoch, ind,
                                         num_train // args.batchsize,
                                         elapsed * 1000.0 / args.loginterval,
                                         cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in,
                                       n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in,
                                   n_feat)
        # if args.save and (val_acc > best_val_acc):
        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            if args.cuda:
                model.cuda()

        print(
            'Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'
            .format(epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)

    del train_loader

    print('Reloading Best Model')
    model = torch.load(args.save)
    if args.cuda:
        model.cuda()
    model.eval()

    print('RELOADING VALID')

    valid_data = pd.read_csv('../data/val_data_shuffle.csv')
    valid_data = valid_data.fillna(' ')

    q1_val = list(valid_data['question1'].map(str))
    q2_val = list(valid_data['question2'].map(str))
    y_val = list(valid_data['is_duplicate'])

    train_feat = pd.read_csv('../data/train_features_all_norm.csv')
    val_feat = train_feat.iloc[valid_data['id']].values

    if args.clean:
        print('Cleaning Data')
        stops = None
        if args.rm_stops:
            stops = set(stopwords.words("english"))
        q1_val = [split_text(x, stops) for x in q1_val]
        q2_val = [split_text(x, stops) for x in q2_val]
    else:
        q1_val = [x.lower().split() for x in q1_val]
        q2_val = [x.lower().split() for x in q2_val]

    X_val = torch.Tensor(len(valid_data), 1, 3, feat_max)
    X_val[:, 0,
          0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val,
                                                           feat_max)).long()
    X_val[:, 0,
          1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val,
                                                           feat_max)).long()
    X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
    y_val = torch.from_numpy(np.array(y_val)).long()

    if args.cuda:
        X_val, y_val = X_val.cuda(), y_val.cuda()

    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    del X_val, y_val, train_feat, val_feat, q1_val, q2_val, valid_data

    print('PREDICTING VALID')
    pred_list = []
    for ind, (qs, _) in enumerate(valid_loader):
        out = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(),
                    qs[:, 0, 2, :n_feat])
        pred_list += list(out.exp()[:, 1].data.cpu().numpy())

    with open('../predictions/' + args.save + '_val.pkl', 'wb') as f:
        pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)

    if args.reweight:
        print('LOADING TEST DATA')
        test_data = pd.read_csv('../data/test.csv')
        test_data = test_data.fillna(' ')
        q1 = list(test_data['question1'].map(str))
        q2 = list(test_data['question2'].map(str))
        q1 = [x.lower().split() for x in q1]
        q2 = [x.lower().split() for x in q2]

        print('LOADING TEST FEATURES')
        test_feat = pd.read_csv('../data/test_features_all_norm.csv').values

        n_feat = test_feat.shape[1]
        d_in = args.din
        feat_max = int(np.max([n_feat, d_in]))

        X = torch.Tensor(len(test_data), 1, 3, feat_max)
        X[:, 0,
          0, :] = torch.from_numpy(corpus.pad_numericalize(q1,
                                                           feat_max)).long()
        X[:, 0,
          1, :] = torch.from_numpy(corpus.pad_numericalize(q2,
                                                           feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(test_feat))
        y = torch.LongTensor(len(test_data)).zero_()

        if args.cuda:
            X = X.cuda()
            y = y.cuda()

        test_loader = DataLoader(TensorDataset(X, y),
                                 batch_size=500,
                                 shuffle=False)

        print('PREDICTING')
        pred_list = []
        for ind, (qs, _) in enumerate(test_loader):
            out = model(qs[:, 0, 0, :d_in].long(), qs[:, 0, 1, :d_in].long(),
                        qs[:, 0, 2, :n_feat])
            pred_list += list(out.exp()[:, 1].data.cpu().numpy())

        with open('../predictions/' + args.save + '.pkl', 'wb') as f:
            pkl.dump(pred_list, f, protocol=pkl.HIGHEST_PROTOCOL)
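A hedged invocation sketch for this training script; the script name train.py and the flag values below are assumptions chosen to exercise the GloVe initialization and the checkpointing path (--save is also the checkpoint name that gets reloaded near the end, so it should be non-empty, and main() expects a ../predictions/ directory to exist for the pickled outputs).

import sys

if __name__ == "__main__":
    # hypothetical command line, equivalent to:
    #   python train.py --cuda --embinit glove --epochs 20 --save convrnn_feat
    sys.argv = [
        "train.py",
        "--cuda",                  # train and predict on the GPU
        "--embinit", "glove",      # initialize embeddings from pretrained GloVe
        "--epochs", "20",
        "--save", "convrnn_feat",  # model path, also used as the prediction file prefix
    ]
    main()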