Example #1
def create_tagging_schema(directory_source,source,attributes_chosed,raf=False,raf_attribute="",name=""):
    files = os.listdir(directory_source)
    source_sentences = []
    source_example_counts = dict()
    print("Tagging al the sentence of source: "+source+" ...")
    for filename in [file for file in files if file.endswith(".json")]:
        js = utils.open_json(filename,source)
        for attribute in js:
            if raf:
                if isinstance(js[attribute],str) and attribute!="<page title>" and attribute==raf_attribute:
                    sentence = []
                    for token in utils.tokenizer(attribute):
                        sentence.append((token,"O"))
                    sentence.append(("ENDNAME","O"))
                    sentence = sentence + tag_sentence(attribute,js[attribute],attributes_chosed,source,raf,raf_attribute,name)
                    sentence.append(("ENDVALUE","O"))
                    if useful(sentence)>0:
                        source_sentences.append((raf_attribute,sentence))
                        source_example_counts.setdefault(raf_attribute,0)
                        source_example_counts[raf_attribute]=source_example_counts[raf_attribute]+1
            else:
                if isinstance(js[attribute],str) and attribute!="<page title>" and [t for t in d.get_predicate_name(attribute,source,True) if t in attributes_chosed]:
                    sentence = []
                    for token in utils.tokenizer(attribute):
                        sentence.append((token,"O"))
                    sentence.append(("ENDNAME","O"))
                    sentence = sentence + tag_sentence(attribute,js[attribute],attributes_chosed,source)
                    sentence.append(("ENDVALUE","O"))
                    if useful(sentence)>0:
                        p_name = d.get_predicate_name(attribute,source,True)[0]
                        source_sentences.append((p_name,sentence))
                        source_example_counts.setdefault(p_name,0)
                        source_example_counts[p_name]=source_example_counts[p_name]+1
    return (source_example_counts,source_sentences)
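For reference, here is a minimal sketch of the (token, tag) structure that one iteration of the loop above appends; the attribute, value, and tag names are hypothetical and not taken from the dataset:

# Hypothetical tagged sentence: attribute-name tokens, the ENDNAME marker,
# the value tokens as tagged by tag_sentence, then the ENDVALUE marker.
example_sentence = [
    ("brand", "O"), ("name", "O"), ("ENDNAME", "O"),
    ("Canon", "B-brand"), ("EOS", "I-brand"), ("70D", "I-brand"),
    ("ENDVALUE", "O"),
]
# useful(example_sentence) presumably counts the non-"O" tags, so a sentence
# like this one would be kept and counted in source_example_counts.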
Example #2
def train():
    training_ls = dir_reader(TRAIN_DIR)
    relations = relation_reader(cache=RELATIONS)
    assert relations['Other'] == 0
    assert Relation_type == len(relations)
    print(relations)

    train_ls, val_ls = training_ls[:len(training_ls) -
                                   800], training_ls[len(training_ls) - 800:]
    #train_ls, val_ls = training_ls[:2], training_ls[:10]
    train_data = tokenizer((train_ls, TRAIN_DIR),
                           relations,
                           pretrain_type='elmo_repre')
    val_data = tokenizer((val_ls, TRAIN_DIR),
                         relations,
                         pretrain_type='elmo_repre')

    print('%d training data, %d validation data' %
          (len(train_data.data), len(val_data.data)),
          flush=True)

    LSTM_layer = SeqLayer(ELMo_size, Hidden_size, Hidden_layer, Dropout,
                          Bidirection).cuda()
    RE = RelationDetect_woemb(Hidden_size, Relation_type, Hidden_size,
                              Dropout).cuda()

    print('network initialized', flush=True)

    #LSTM_layer.load_state_dict(torch.load(SAVE_DIR + 'LSTM_499'))
    #RE.load_state_dict(torch.load(SAVE_DIR + 'RE_499'))

    if os.path.exists(LOG_FILE):
        os.remove(LOG_FILE)

    end2end(train_data, val_data, LSTM_layer, RE, Learning_rate, Epoch)
Example #3
def add_false_examples(set,target):
    sentences = []
    for s in [x[1] for x in os.walk(config["DIRECTORY_DATASET"])][0]:
        if s!="www.ebay.com" and s!="www.alibaba.com":
            directory_source = config["DIRECTORY_DATASET"] + s
            files = os.listdir(directory_source)
            print(directory_source+"...")
            for filename in [file for file in files if file.endswith(".json")]:
                js = utils.open_json(filename,s)
                sentence = []
                for attribute in js:
                    if d.get_predicate_name(attribute,s,False)[0]==target and isinstance(js[attribute],str) and js[attribute]!="Black":
                        for token in utils.tokenizer(attribute):
                            sentence.append((token,"O"))
                        sentence.append(("ENDNAME","O"))
                        for token in utils.tokenizer(js[attribute]):
                            sentence.append((token,"O"))
                        sentence.append(("ENDVALUE","O"))
                sentences.append(sentence)
    with open("dataset/"+set+"_set.txt","a") as f:
        for sentence in sentences:
            for (token,tag) in sentence:
                f.write(token+"\t"+tag+"\n")
            f.write("\n")
    f.close()
Example #4
def main(args):

    device = flow.device("cpu") if args.no_cuda else flow.device("cuda")
    with open(args.config_path, "r") as f:
        config = json.load(f)
    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)
    textcnn = textCNN(
        word_emb_dim=config["word_emb_dim"],
        vocab_size=len(vocab),
        dim_channel=config["dim_channel"],
        kernel_wins=config["kernel_wins"],
        dropout_rate=config["dropout_rate"],
        num_class=config["num_class"],
        max_seq_len=config["max_seq_len"],
    )
    textcnn.load_state_dict(flow.load(args.model_path))
    textcnn.eval()
    textcnn.to(device)
    text = utils.clean_str(args.text)
    text = [utils.tokenizer(text)]
    input = flow.tensor(np.array(utils.tensorize_data(text, vocab,
                                                      max_len=200)),
                        dtype=flow.long).to(device)
    predictions = textcnn(input).softmax()
    predictions = predictions.numpy()
    clsidx = np.argmax(predictions)
    print("predict prob: %f, class name: %s" % (np.max(predictions), clsidx))
Example #5
def syntactic_test(path,
                   syntactic_model,
                   syntactic_device,
                   vocab_mapping,
                   batch_size: int = 72):
    """
    Args:
        syntactic_model: model tested
        vocab_mapping: dictionary mapping words to unique integers
        syntactic_device: computing device
        batch_size: batch size to use while computing logprobs

    Returns: list of log probabilities assigned to each sentence
    """

    # load & tokenize stimuli
    test_sentences = load_sentences(path)
    tokenized_sentences = tokenizer(test_sentences)
    encoded_tokens = encode_words(tokenized_sentences, vocab_mapping)
    print("number of sentences after encoding tokens:", len(encoded_tokens),
          len(encoded_tokens[-1]))

    num_steps = math.ceil(len(encoded_tokens) / batch_size)
    all_probs = []
    for i in tqdm.trange(num_steps, desc="Computing logprobs"):
        sent_tok_ids, logprobs = get_words_logprobs(
            encoded_tokens[i * batch_size:(i + 1) * batch_size],
            syntactic_model, vocab_mapping, syntactic_device)
        all_probs.extend(get_sentences_probs(sent_tok_ids, logprobs))

    return all_probs
Example #6
    def __init__(self):

        self.token_en = utils.tokenizer('en')
        self.token_de = utils.tokenizer('de')
        device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        print(self.token_en.get_tokenizer()("I LOVE U"))
        self.SRC = utils.create_filed(self.token_de.get_list)
        self.TRG = utils.create_filed(self.token_en.get_list)
        self.train_data, self.valid_data, self.test_data = Multi30k.splits(
            exts=('.de', '.en'), fields=(self.SRC, self.TRG))
        self.SRC.build_vocab(self.train_data, min_freq=MIN_FREQ)
        self.TRG.build_vocab(self.train_data, min_freq=MIN_FREQ)
        self.train_iterator, self.valid_iterator, self.test_iterator = BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            batch_size=BATCH_SIZE,
            device=device)
Example #7
 def predict_batch(self, list_sentences, all_words, word2idx, idx2tag):
     sent_token, sent_matrix = utils.tokenizer(list_sentences, all_words,
                                               word2idx, self.MAX_LENGTH)
     predict = self.sess.run(tf.argmax(self.predict, 2),
                             feed_dict={self.X: sent_matrix})
     # convert to tag
     tags = []
     for i in range(len(predict)):
         tag_predict = []
         for j in range(len(sent_token[i])):
             tag_predict.append(idx2tag[predict[i][j]])
         tags.append(tag_predict)
     return sent_token, tags
Example #8
def _token_chunks(s: str, s2=None, add_special_tokens=False):
    """
    Helper function to tokenize without special tokens and returning
    only a numpy array for speed.
    """

    text = s if s2 is None else (s, s2)

    tokens = utils.tokenizer(
        [text],
        return_tensors="np",
        truncation="only_first",
        add_special_tokens=add_special_tokens,
    )

    return utils.tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
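The keyword arguments above (return_tensors="np", truncation="only_first", add_special_tokens) suggest that utils.tokenizer wraps a Hugging Face tokenizer. A self-contained sketch under that assumption, with an arbitrarily chosen checkpoint, could look like this:

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def token_chunks(s, s2=None, add_special_tokens=False):
    # single sentence or sentence pair, as in the original helper
    text = s if s2 is None else (s, s2)
    encoded = hf_tokenizer(
        [text],
        return_tensors="np",
        truncation="only_first",
        add_special_tokens=add_special_tokens,
    )
    # convert the ids of the first (and only) sequence back to token strings
    return hf_tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())

# token_chunks("tokenize me") returns the WordPiece tokens of the string.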
Example #9
 def transcribe(self, recognizer):
     """
     use kaldi asr model to transcribe pcm_data
     for model structure check https://alphacephei.com/vosk/models.html
     input:
         pcm_data,
         instance of KaldiRecognizer,
         instance of nltk PorterStemmer
     output:
         [list of single-word Strings]
     """
     # ASR
     recognizer.AcceptWaveform(self.bytes)
     utterance = loads(recognizer.Result())['text']
     # tokenize into a list of 'words'
     self.transcript.extend(tokenizer(utterance))
Example #10
def best_tagging(attribute,value,tokens_tag,attributes_chosed,source,raf,raf_attribute,name):
    tokens = utils.tokenizer(value)
    l = 0
    json_sentence = []
    single_value = single_value_tag(attribute,tokens,tokens_tag,attributes_chosed,source,raf,raf_attribute,name)
    while l<len(tokens)-1:
        current = value.rsplit(' ',l)[0]
        predicate_names = [t_attr for (v,t_attr) in tokens_tag if v==current]
        if not raf:
            predicate_name = d.coeherent_attribute(attribute,source)
            if predicate_name in predicate_names:
                temp = 0
                last_tag = "O"
                for token in tokens:
                    if token in current and not re.match("[,:;()\\\/]",token):
                        if last_tag=="O":
                            json_sentence.append((token,"B-"+predicate_name))
                            last_tag = "B"
                        else:
                            json_sentence.append((token,"I-"+predicate_name))
                            last_tag = "I"
                    else:
                        json_sentence.append((token,"O"))
                        last_tag = "O"
                    temp+=1
        else:
            if name in predicate_names:
                temp = 0
                last_tag = "O"
                for token in tokens:
                    if token in current and not re.match("[,:;()\\\/]",token):
                        if last_tag=="O":
                            json_sentence.append((token,"B-"+name))
                            last_tag = "B"
                        else:
                            json_sentence.append((token,"I-"+name))
                            last_tag = "I"
                    else:
                        json_sentence.append((token,"O"))
                        last_tag = "O"
                    temp+=1
        if useful(json_sentence)>useful(single_value):
            return json_sentence
        l+=1
    return single_value
Example #11
def predict_fn(input_data, model):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenized = tokenizer(input_data)
    
    numericalized = [model.txt_field.vocab.stoi[t] for t in tokenized] 
    sentence_length = torch.LongTensor([len(numericalized)]).to(device) 
    tensor = torch.LongTensor(numericalized).unsqueeze(1).to(device) 
    
    model.to(device)
    model.eval()
    with torch.no_grad():
        forwardpass = model(tensor, sentence_length)
    
    _, indices = torch.topk(torch.sigmoid(forwardpass),k=3)
    tags = [model.label_field.vocab.itos[t] for t in indices.tolist()[0]]
    return tags
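The top-3 selection at the end of predict_fn can be shown in isolation; the logits and label vocabulary below are made up for illustration and are not the model's actual fields:

import torch

logits = torch.tensor([[2.1, -0.3, 0.8, -1.5, 1.2]])   # hypothetical raw outputs for 5 labels
itos = ["python", "java", "c++", "rust", "go"]          # hypothetical label vocabulary

probs = torch.sigmoid(logits)        # independent per-label probabilities
_, indices = torch.topk(probs, k=3)  # indices of the 3 highest-scoring labels
tags = [itos[i] for i in indices.tolist()[0]]
# tags == ["python", "go", "c++"]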
Example #12
    def __init__(self, topic, comma=False, num=False):
        self.origin = topic

        # convert full-width characters to half-width
        half_topic = utils.fullToHalf(topic)

        # convert spaces to commas
        self.rep_topic = half_topic
        if comma:
            self.rep_topic = utils.spacesToComma(half_topic)

        # split into tokens
        self.words = utils.tokenizer(self.rep_topic)

        # normalize case and digits
        self.rep_words = [w.lower() for w in self.words]
        if num:
            self.rep_words = utils.replaceCaseAndNums(self.words)

        self.len = len(self.rep_words)
        self.rep_string = " ".join(self.rep_words).strip()
Example #13
	def __init__(self, file_path, vocab, tokenizer):
		self.file_path = file_path
		self.data = []
		self.vocab = vocab
		self.tokenizer = tokenizer

		# open text file
		file = open(self.file_path, 'r', encoding='utf-8')

		lines = file.read()
		lines = lines.split("\n")

		datasets = []
		now = ""
		for i, line in enumerate(lines):
			if i % 30 == 0 and i != 0:
				datasets.append(now)
				now = ""
				continue
			now = now + "\n" + line


		print("tokenizer ending")

		# use for loop to iterate array of lines
		for line in datasets:
			if not line:
				break
			if len(line) < 3:
				continue

			tokenized_line = tokenizer(line[:-1])

			index_of_words = [vocab[vocab.bos_token], ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
			self.data.append(index_of_words)

		print(np.shape(self.data))

		file.close()
Example #14
def processing_data(infile, labelfile, outfile, vocab_file, stopwords_file):
    print('Loading stopwords...')
    stopwords = get_stopwords(stopwords_file)

    print('Loading data...')
    data = pd.read_csv(infile)

    print('Saving labels')
    with open(labelfile, 'w') as f:
        for label in data.columns[2:]:
            f.write(label + '\n')

    # split sentences into words
    print('Splitting content')
    contents = data['content'].tolist()
    seg_contents = segmentData(contents, stopwords)

    if not os.path.exists(vocab_file):
        print('Creating vocabulary...')
        create_vocab(seg_contents, vocab_file, 50000)

    print('Loading vocabulary...')
    w2i, _ = read_vocab(vocab_file)

    # word2id
    print('Tokenize...')
    token_contents = [tokenizer(c, w2i) for c in seg_contents]
    data['content'] = token_contents

    # convert labels to one-hot form
    print('One-hot label')
    for col in data.columns[2:]:
        label = data[col].tolist()
        onehot_label = [onehot(l) for l in label]
        data[col] = onehot_label

    print('Saving...')
    data[data.columns[1:]].to_csv(outfile, index=False)
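A minimal sketch of the word-to-id step that tokenizer(c, w2i) performs here; the vocabulary and the <UNK> fallback are assumptions for illustration, not the project's actual implementation:

# Hypothetical vocabulary; the real ids come from read_vocab(vocab_file).
w2i = {"<PAD>": 0, "<UNK>": 1, "good": 2, "movie": 3}

def to_ids(segmented_sentence, w2i):
    # map each segmented word to its id, falling back to <UNK>
    return [w2i.get(w, w2i["<UNK>"]) for w in segmented_sentence]

print(to_ids(["good", "movie", "tonight"], w2i))  # -> [2, 3, 1]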
Example #15
 def process_data(self, comment):
     seg_comment = segmentData([comment], self.stopwords)[0]
     tokens = tokenizer(seg_comment, self.w2i)
     return tokens
Example #16
 def __init__(self, target_vocabulary):
     self.target_vocabulary = tokenizer(target_vocabulary)
     self.dictionary = {}
Example #17
	frequency_by_doc = []

	for post in data:
		# get title
		if count_iter==len(random_idx):
			break

		if count == random_idx[count_iter]:
			count_iter+=1
			
			# processing for core | JANUARY TEST
			text = '\n'.join([post.get('title',''),post.get('description','')])
			if text=='':
				print("Wot, texto vacio!!")
			# Tokenize and assign filter tags
			tokens = tokenizer(text)
			
			# processing for core_tokenized | TEST OF 1000
			#tokens = post['tokens']

			filtered_tokens = filterTokens(tokens,word_dict_filtered)
			# Title map and doc_title mapping
			title = ' '.join(tokens[0])[:50]
			doc_title = 'doc%i' % count_iter
			title_map_file.write(doc_title+'\n')
			doc_title_map[doc_title] = title
			
			## content to display
			content = [' '.join(sent) for sent in filtered_tokens]
			content = '<br>\n'.join(content)
			open(os.path.join(docs_dir,doc_title),'w').write(content)
Example #18
def load_data():
    datasets = load_csv(DATA_PATH, filter_title=True, total=8000)
    X = [tokenizer(review) for label, review in datasets]
    y = [int(label) for label, review in datasets]
    return X, y
Example #19
def test():
    test_ls = dir_reader(TEST_DIR)
    relations = relation_reader(cache=RELATIONS)
    assert relations['Other'] == 0
    assert Relation_type == len(relations)
    print(relations)

    test_data = tokenizer((test_ls, TEST_DIR),
                          relations,
                          pretrain_type='elmo_repre')
    print('%d test data' % len(test_data.data), flush=True)

    LSTM_layer = SeqLayer(ELMo_size, Hidden_size, Hidden_layer, Dropout,
                          Bidirection).cuda()
    RE = RelationDetect_woemb(Hidden_size, Relation_type, Hidden_size,
                              Dropout).cuda()

    LSTM_layer.load_state_dict(
        torch.load(SAVE_DIR + 'LSTM' + MODEL_NAME + '999'))
    RE.load_state_dict(torch.load(SAVE_DIR + 'RE' + MODEL_NAME + '999'))

    print('network initialized', flush=True)

    if os.path.exists(TEST_LOG_FILE):
        os.remove(TEST_LOG_FILE)

    test_data.reset_epoch()
    LSTM_layer.eval()
    RE.eval()
    TP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FN = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    F1 = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    Precision = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    Recall = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    total_F1 = [0.] * len(Relation_threshold)
    micro_F1 = [0.] * len(Relation_threshold)
    total_F1_9 = [0.] * len(Relation_threshold)
    micro_F1_9 = [0.] * len(Relation_threshold)
    macro_F1_9 = [0.] * len(Relation_threshold)
    precision_9 = [0.] * len(Relation_threshold)
    recall_9 = [0.] * len(Relation_threshold)
    while not test_data.epoch_finish:
        standard_emb, e_label, e_posi, r_label, seq_length, mask, seq_pos = test_data.get_batch(
            Batch_size)
        #print(standard_emb.size())
        #print(e_label)
        #print(e_posi, r_label, seq_length)
        #input()
        ctx = LSTM_layer(standard_emb, seq_length)

        # get relationship
        for i in range(Batch_size):
            '''# take NER into computation
            for s in range(1, seq_length[i] + 1):  # s is the count of word number
                if s - 1 in e_posi[i][0] and e_posi[i][0][0] > e_posi[i][1][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][1]]
                elif s - 1 in e_posi[i][1] and e_posi[i][1][0] > e_posi[i][0][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][0]]
                else:
                    gts = [((s - 1), 0)]
                #print(gts)

                u = RE(ctx[i:i + 1, :s, :])
                result = nn.Softmax(dim=-1)(u[0, :, :].view(-1))
                #print(result)
                #print(result.size())
                #input()

                for j, th in enumerate(Relation_threshold):
                    candidates = (result > th).nonzero()
                    #print(candidates)
                    for location, rtype in gts:
                        gt = location * Relation_type + rtype
                        if gt in candidates:
                            # correct entity correct relation
                            TP[j][rtype] += 1
                            candidates = candidates[candidates != gt]
                        else:
                            # at least one is wrong
                            FN[j][rtype] += 1
                    for candidate in candidates:
                        gt_locations = [l for (l, rt) in gts]
                        if candidate // Relation_type in gt_locations:
                            # correct entity wrong relation, omit
                            continue
                        else:
                            # wrong entity
                            FP[j][candidate % Relation_type] += 1
                    #print(TP[j])
                    #print(FN[j])
                    #print(FP[j])
                    #input()'''

            # ignore NER
            if e_posi[i][0][0] > e_posi[i][1][0]:
                s = e_posi[i][0][0]
                gts = [
                    posi * Relation_type + r_label[i] for posi in e_posi[i][1]
                ]
                gtp = [posi for posi in e_posi[i][1]]
            else:
                s = e_posi[i][1][0]
                gts = [
                    posi * Relation_type + r_label[i] for posi in e_posi[i][0]
                ]
                gtp = [posi for posi in e_posi[i][0]]

            u = RE(ctx[i:i + 1, :s + 1, :])
            result = nn.Softmax(dim=-1)(u[0, :, :].view(-1))

            for j, th in enumerate(Relation_threshold):
                candidates = (result > th).nonzero()
                # print(candidates)
                for candidate in candidates:
                    if candidate in gts:
                        # correct entity correct relation
                        TP[j][r_label[i]] += 1
                    else:
                        # at least one is wrong
                        FN[j][r_label[i]] += 1
                        FP[j][candidate % Relation_type] += 1

    for j, th in enumerate(Relation_threshold):
        for r in range(Relation_type):
            F1[j][r] = (2 * TP[j][r] + epsilon) / (2 * TP[j][r] + FP[j][r] +
                                                   FN[j][r] + epsilon)
            Precision[j][r] = (TP[j][r] + epsilon) / (TP[j][r] + FP[j][r] +
                                                      epsilon)
            Recall[j][r] = (TP[j][r] + epsilon) / (TP[j][r] + FN[j][r] +
                                                   epsilon)
        total_F1[j] = np.average(np.array(F1[j]))
        micro_F1[j] = (2 * sum(TP[j]) + epsilon) / (
            2 * sum(TP[j]) + sum(FP[j]) + sum(FN[j]) + epsilon)
        total_F1_9[j] = np.average(np.array(F1[j][1:]))
        micro_F1_9[j] = (2 * sum(TP[j][1:]) + epsilon) / (
            2 * sum(TP[j][1:]) + sum(FP[j][1:]) + sum(FN[j][1:]) + epsilon)
        precision_9[j] = np.average(np.array(Precision[j][1:]))
        recall_9[j] = np.average(np.array(Recall[j][1:]))
        macro_F1_9[j] = (2 * recall_9[j] * precision_9[j] +
                         epsilon) / (recall_9[j] + precision_9[j] + epsilon)
        print('(threshold %.2f)' % th, flush=True)
        print('with other: ave F1: %.4f, micro F1: %.4f' %
              (total_F1[j], micro_F1[j]),
              flush=True)
        print(
            'without other: ave F1: %.4f, micro F1: %.4f, macro F1: %.4f, ave precision: %.4f, ave recall: %.4f'
            % (total_F1_9[j], micro_F1_9[j], macro_F1_9[j], precision_9[j],
               recall_9[j]),
            flush=True)

    with open(TEST_LOG_FILE, 'a+') as LogDump:
        LogWriter = csv.writer(LogDump)
        LogWriter.writerows(F1)
Example #20
def predict(text, model):
    X = VECTORIZER.transform([tokenizer(text)])
    y = MODELS[model].predict(X)[0]
    return y
Example #21
def test():
    test_ls = dir_reader(TEST_DIR)
    relations = relation_reader(cache=RELATIONS)
    assert relations['Other'] == 0
    assert Relation_type == len(relations)
    print(relations)

    test_data = tokenizer((test_ls, TEST_DIR),
                          relations,
                          pretrain_type='elmo_repre')
    print('%d test data' % len(test_data.data), flush=True)

    LSTM_layer = SeqLayer(ELMo_size, Hidden_size, Hidden_layer, Dropout,
                          Bidirection).cuda()
    NER = EntityDetect(Label_embed, Hidden_size, 3, Dropout).cuda()
    RE = RelationDetect(Hidden_size, Label_embed, Relation_type, Hidden_size,
                        Dropout).cuda()

    LSTM_layer.load_state_dict(torch.load(SAVE_DIR + 'LSTM_2_999'))
    NER.load_state_dict(torch.load(SAVE_DIR + 'NER_2_999'))
    RE.load_state_dict(torch.load(SAVE_DIR + 'RE_2_999'))

    print('network initialized', flush=True)

    if os.path.exists(TEST_LOG_FILE):
        os.remove(TEST_LOG_FILE)

    test_data.reset_epoch()
    LSTM_layer.eval()
    NER.eval()
    RE.eval()
    TP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FP = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    FN = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    F1 = [[0.] * Relation_type for _ in range(len(Relation_threshold))]
    total_F1 = [0.] * len(Relation_threshold)
    micro_F1 = [0.] * len(Relation_threshold)
    total_F1_9 = [0.] * len(Relation_threshold)
    micro_F1_9 = [0.] * len(Relation_threshold)
    precision_9 = [0.] * len(Relation_threshold)
    recall_9 = [0.] * len(Relation_threshold)
    count_all = 0
    correct_raw = 0

    while not test_data.epoch_finish:
        standard_emb, e_label, e_posi, r_label, seq_length, mask, seq_pos = test_data.get_batch(
            Batch_size)
        # print(standard_emb.size())
        # print(e_label)
        # print(e_posi, r_label, seq_length)
        # input()
        ctx = LSTM_layer(standard_emb, seq_length)

        label_emb = torch.zeros((Batch_size, max(seq_length), Label_embed),
                                requires_grad=False).cuda()
        y_out = torch.zeros(Batch_size, requires_grad=False).long().cuda()
        y_all = torch.zeros((Batch_size, max(seq_length)),
                            requires_grad=False).long().cuda()
        for s in range(max(seq_length)):
            v_tp, logit, y_out = NER(ctx[:, s, :], y_out)
            for i in range(Batch_size):
                y_all[i, s] = y_out[i].detach() if s < seq_length[i] else -1
                if s > 0 and s <= seq_length[i]:
                    label_emb[i, s - 1, :] = v_tp[i, :].detach(
                    )  # record embedding of label of last time step

        # get label embedding of the last step
        v_tp, _, _ = NER(torch.zeros(Batch_size, Hidden_size).cuda(), y_out)
        for i in range(Batch_size):
            if seq_length[i] == max(seq_length):
                label_emb[i, -1, :] = v_tp[i, :].detach()

        # print(y_all)
        # print(e_label)
        # print(label_emb[:, :, 0])
        # input()

        # compute entity detection accuracy
        for i in range(Batch_size):
            count_all += 1
            e1 = y_all[i, :seq_length[i]].nonzero()
            e2 = e_label[i, :seq_length[i]].nonzero()
            correct_raw += int(torch.equal(e1, e2))

        # get relationship
        for i in range(Batch_size):
            for s in range(1,
                           seq_length[i] + 1):  # s is the count of word number
                if s - 1 in e_posi[i][0] and e_posi[i][0][0] > e_posi[i][1][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][1]]
                elif s - 1 in e_posi[i][
                        1] and e_posi[i][1][0] > e_posi[i][0][0]:
                    gts = [(posi, r_label[i]) for posi in e_posi[i][0]]
                else:
                    gts = [((s - 1), 0)]

                u = RE(ctx[i:i + 1, :s, :], label_emb[i:i + 1, :s, :])
                result = nn.Softmax(dim=-1)(u[0, :, :].view(-1))

                for j, th in enumerate(Relation_threshold):
                    candidates = (result > th).nonzero()
                    for location, rtype in gts:
                        gt = location * Relation_type + rtype
                        if gt in candidates:
                            # correct entity correct relation
                            TP[j][rtype] += 1
                            candidates = candidates[candidates != gt]
                        elif gt not in candidates:
                            # at least one is wrong
                            FN[j][rtype] += 1
                    for candidate in candidates:
                        gt_locations = [l for (l, rt) in gts]
                        if candidate // Relation_type in gt_locations:
                            # correct entity wrong relation, omit
                            continue
                        else:
                            # wrong entity
                            FP[j][candidate % Relation_type] += 1

    print('NER raw accuracy: %.4f' % (correct_raw / count_all), flush=True)
    for j, th in enumerate(Relation_threshold):
        for r in range(Relation_type):
            F1[j][r] = (2 * TP[j][r] + epsilon) / (2 * TP[j][r] + FP[j][r] +
                                                   FN[j][r] + epsilon)
        total_F1[j] = np.average(np.array(F1[j]))
        micro_F1[j] = (2 * sum(TP[j]) + epsilon) / (
            2 * sum(TP[j]) + sum(FP[j]) + sum(FN[j]) + epsilon)
        total_F1_9[j] = np.average(np.array(F1[j][1:]))
        micro_F1_9[j] = (2 * sum(TP[j][1:]) + epsilon) / (
            2 * sum(TP[j][1:]) + sum(FP[j][1:]) + sum(FN[j][1:]) + epsilon)
        precision_9[j] = (sum(TP[j][1:]) + epsilon) / (
            sum(TP[j][1:]) + sum(FP[j][1:]) + epsilon)
        recall_9[j] = (sum(TP[j][1:]) + epsilon) / (sum(TP[j][1:]) +
                                                    sum(FN[j][1:]) + epsilon)
        print('(threshold %.2f)' % th, flush=True)
        print('with other: val ave F1: %.4f, val micro F1: %.4f' %
              (total_F1[j], micro_F1[j]),
              flush=True)
        print(
            'without other: val ave F1: %.4f, val micro F1: %.4f, precision: %.4f, recall: %.4f'
            % (total_F1_9[j], micro_F1_9[j], precision_9[j], recall_9[j]),
            flush=True)

    with open(TEST_LOG_FILE, 'a+') as LogDump:
        LogWriter = csv.writer(LogDump)
        LogWriter.writerows(F1)
Example #22
        predictions.append(predicted_code.split())
    bleu = corpus_bleu(actual, predictions)
    return bleu, actual, predictions


if __name__ == '__main__':
    argv = sys.argv[1:]
    if len(argv) != 1:
        print('Need to supply an argument specifying model path')
        exit(0)
    model_path = argv[0]
    test_dir = '../data/test/'
    # model_path = '../results/'
    vocab_path = '../data/code.vocab'

    tokenizer = tokenizer(vocab_path)
    bleu, actual, predictions = evaluate_model(test_dir,
                                               model_path,
                                               tokenizer,
                                               CONTEXT_LENGTH,
                                               display=False)
    # Calculate BLEU score (standard is 4-gram, but just get all individual N-Gram BLEU scores from 1 gram to 4 gram)
    # By default, the sentence_bleu() and corpus_bleu() scores calculate the cumulative 4-gram BLEU score, also called BLEU-4.
    # It is common to report the cumulative BLEU-1 to BLEU-4 scores when describing the skill of a text generation system.
    # 4-gram is the most strict and corresponds the best to human translations
    print('BLEU-1: %f' %
          corpus_bleu(actual, predictions, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' %
          corpus_bleu(actual, predictions, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' %
          corpus_bleu(actual, predictions, weights=(0.3, 0.3, 0.3, 0)))
Example #23
import numpy.linalg as linalg
import re

import pdb

USE_ALLENNLP = False
#use flag, as some users reported issues with installation.
if USE_ALLENNLP:
    import allennlp.data.tokenizers.word_tokenizer as tokenizer
    from allennlp.data.tokenizers.word_filter import StopwordFilter
    tk = tokenizer.WordTokenizer()
    stop_word_filter = StopwordFilter()
else:
    print('Note: using rudimentary tokenizer, for better results enable allennlp.')
    stop_word_filter = utils.stop_word_filter()
    tk = utils.tokenizer()
    
'''
Combines content and noise words embeddings
'''
def doc_word_embed_content_noise(content_path, noise_path, whiten_path=None, content_lines=None, noise_lines=None, opt=None):
    no_add_set = set()
    doc_word_embed_f = doc_word_embed_sen
    content_words_ar, content_word_embeds = doc_word_embed_f(content_path, no_add_set, content_lines=content_lines)
    words_set = set(content_words_ar)
    noise_words_ar, noise_word_embeds = doc_word_embed_f(noise_path, set(content_words_ar), content_lines=noise_lines)
    content_words_ar.extend(noise_words_ar)
    words_ar = content_words_ar
    word_embeds = torch.cat((content_word_embeds, noise_word_embeds), dim=0)
    
    whitening = opt.whiten if opt is not None else True  
Example #24
    for post in data:
        # get title
        if count_iter == len(random_idx):
            break

        if count == random_idx[count_iter]:
            count_iter += 1

            # processing for core | JANUARY TEST
            text = '\n'.join(
                [post.get('title', ''),
                 post.get('description', '')])
            if text == '':
                print("Wot, texto vacio!!")
            # Tokenize and assign filter tags
            tokens = tokenizer(text)

            # processing for core_tokenized | TEST OF 1000
            #tokens = post['tokens']

            filtered_tokens = filterTokens(tokens, word_dict_filtered)
            # Title map and doc_title mapping
            title = ' '.join(tokens[0])[:50]
            doc_title = 'doc%i' % count_iter
            title_map_file.write(doc_title + '\n')
            doc_title_map[doc_title] = title

            ## content to display
            content = [' '.join(sent) for sent in filtered_tokens]
            content = '<br>\n'.join(content)
            open(os.path.join(docs_dir, doc_title), 'w').write(content)
Example #25
from solver import Solver
from data_loader import get_loader, get_vocab
from configs import get_config
from utils import tokenizer

if __name__ == '__main__':
    config = get_config(batch_size=1)
    print(config)

    data_loader = get_loader(batch_size=config.batch_size,
                             max_size=config.vocab_size,
                             is_train=False,
                             data_dir=config.data_dir)

    solver = Solver(config, data_loader)
    solver.build(is_train=False)
    solver.load(epoch=2)
    vocab = get_vocab()

    while True:
        text = input('Insert Sentence: ')
        text = tokenizer(text)
        text = [vocab.stoi[word] for word in text]

        prediction = solver.inference(text)

        if prediction == 0:
            print('Positive!')
        else:
            print('Negative')
Example #26
 def build_tokenizer(self, tokenize='default'):
     self.indices_token, self.token_indices = tokenizer(mode=tokenize)
     self.n_chars = len(self.indices_token.keys())
Example #27
import csv
import utils
sentences = []
with open(r'..\data\raw\corpus_raw.csv', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        text = utils.normalizer(row['SOSPECHA_DIAGNOSTICA'])
        tokens = utils.tokenizer(text)
        tokens.append('<END>')
        tokens.insert(0, '<START>')
        sentence = '!#!'.join(tokens)
        if len(sentence) > 0:
            sentences.append(sentence)
with open(r"..\data\processed\corpus.csv", "w", encoding='utf-8') as output:
    for sentence in sentences:
        output.write(sentence + '\n')

Example #28
            total_fp += fp
            total_tn += tn
            total_fn += fn
            #print(str(tp)+" "+str(fp)+" "+str(tn)+" "+str(fn)+" ")
            #print("tp: "+str(total_tp)+"fp: "+str(total_fp)+"tn: "+str(total_tn)+"fn: "+str(total_fn))
            #print("\n")
            #matched_num += sum([1 for tag in pred if tag in true and tag["type"]!=0])
    precision = total_tp / (total_tp + total_fp + eps)
    recall = total_tp / (total_tp + total_fn + eps)
    #recall = (matched_num + eps) / (total_true + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)
    print('P: %.4f  R: %.4f  F: %.4f' % (precision, recall, f1))
    with open(root + "/" + config["TEST_SET"], "r") as f:
        tokens = []
        for line in [l for l in f if not l.startswith("-DOCSTART-")]:
            words = utils.tokenizer(line)
            if words:
                word = words[0]
                tokens.append(word)
        f.close()
    with open(
            "risultati_opentag/esperimento#" + str(config["ESPERIMENTO"]) +
            ".txt", "w+") as f:
        f.truncate(0)
        for token, tag in zip(tokens, lines):
            f.write(token + tag)
        f.close()
        config["ESPERIMENTO"] += 1
        with open("config.json", "w") as c:
            json.dump(config, c)