def show_instances(class_name=''): name = 'train' if name == 'train': filename = Const.origin_all_train_filename elif name == 'dev': filename = Const.origin_all_dev_filename f = open(filename, 'r') content = f.readlines() html_doc = ' '.join(content) sentences_string, triples_string = utils.parse(html_doc) sentences_words = utils.sentence_tokenize(sentences_string) position_triples = utils.find_entity_position(sentences_words, triples_string) sentences_word_id, sentence_triples_id = utils.turn2id( sentences_words, position_triples) utils.triples_type(sentence_triples_id) if class_name == 'normal': func = utils.is_normal_triple elif class_name == 'single_entity_overlap': func = utils.is_over_lapping else: func = utils.is_multi_label words2id = utils.load_words2id() id2words = {v: k for k, v in words2id.items()} for sent_words_id, triples_id in zip(sentences_word_id, sentence_triples_id): if func(triples_id, is_relation_first=False): print ' '.join([id2words[x] for x in sent_words_id]) print triples_id print '-----------------------------------'
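# Minimal usage sketch (assumption: Const points at an existing train dump and
# utils provides the helpers referenced above). The argument strings mirror the
# branches of show_instances; any other value falls through to is_multi_label.
def _demo_show_instances():
    for class_name in ('normal', 'single_entity_overlap', 'multi_label'):
        show_instances(class_name)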
def _pre_process(news_file_name, output_file):
    """A very simple pre-processing pass over one raw news dump."""
    print("pre processing '%s'..." % news_file_name)
    with open(news_file_name) as f:
        content = f.readlines()
    print("writing into '%s'..." % output_file)
    with open(output_file, 'a') as f:
        # each line is one news article
        for news in content:
            news = news.strip()
            # ignore news articles that are too short
            if len(news) < 150:
                continue
            # tokenize the news article sentence by sentence
            sentences = sentence_tokenize(news)
            # we need at least 6 sentences, because the first and the last ones are
            # removed: they sometimes contain the author, the date and so on
            if len(sentences) >= 6:
                for sentence in sentences[1:-1]:
                    f.write(sentence)
                    f.write('\n')
                # extra newline to separate the news articles
                f.write('\n')
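# Usage sketch for the news pre-processing step. The 'raw_news/*.txt' glob and
# the output file name are illustrative assumptions, not paths from the repo.
import glob

def _demo_preprocess_news():
    for news_file in sorted(glob.glob('raw_news/*.txt')):
        _pre_process(news_file, 'news_corpus.txt')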
def load_data(self, unlabeled_data, ids):
    self.message = {}
    self.ids = []
    self.data_num = 0
    for i in tqdm(ids):
        try:
            sentences = []
            labels = []
            doc = unlabeled_data[i]
            doc_len = []
            sent_len = []
            doc += '.'
            # mask URLs (http links, www links and bare domains) with a <website> token
            results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", doc)
            results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", dd)
            results = re.compile(
                r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
            dd = results.sub(" <website> ", dd)
            sents = sentence_tokenize(dd)
            for j in range(0, min(len(sents), self.max_seq_num)):
                # tokenize each sentence with the wordpiece tokenizer
                tokens = self.tokenizer.tokenize(sents[j])
                temp = tokens
                if len(temp) > 0:
                    # wrap the (truncated) sentence with [CLS] ... [SEP]
                    temp_ = ['[CLS]']
                    for k in range(0, min(len(temp), self.max_seq_len - 2)):
                        temp_.append(temp[k])
                    temp_.append('[SEP]')
                    sentences.append(temp_)
                    labels.append(10)  # dummy label for unlabeled data
                    sent_len.append(len(temp_) - 1)
            doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))
            self.message[i] = (sentences[:self.max_seq_num],
                               labels[:self.max_seq_num],
                               sent_len[:self.max_seq_num], doc_len)
            self.ids.append(i)
        except:
            # skip documents that cannot be processed
            pass
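# Self-contained sketch of the URL-masking step used above: every http link,
# www link or bare domain is replaced by a single <website> token. Only the
# standard `re` module is needed; the sample text is made up.
import re

def _demo_mask_urls(text):
    for pattern in (r'http[a-zA-Z0-9.?/&=:#%_-]*',
                    r'www.[a-zA-Z0-9.?/&=:#%_-]*',
                    r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)'):
        text = re.sub(pattern, " <website> ", text)
    return text

# _demo_mask_urls("see http://example.com/a?b=1 or www.example.org for details")
# replaces both URLs with ' <website> ' (note the padding spaces).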
def prepare(name):
    print name
    if name == 'train':
        filename = Const.origin_all_train_filename
    if name == 'dev':
        filename = Const.origin_all_dev_filename
    if name == 'example':
        filename = Const.origin_example_filename
    print Const.triple_len
    f = open(filename, 'r')
    print filename
    content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)
    sentences_words = utils.sentence_tokenize(sentences_string)
    position_triples = utils.find_entity_position(sentences_words, triples_string)
    sentences_word_id, sentence_triples_id = utils.turn2id(
        sentences_words, position_triples)
    if name == 'train':
        # split train file into train and valid set
        [valid_sentences_word_id, valid_sentence_triples_id], \
            [train_sentences_word_id, train_sentence_triples_id] = \
            utils.split(sentences_word_id, sentence_triples_id)
        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)
        json.dump([train_sentences_word_id, train_sentence_triples_id],
                  open(Const.train_filename, 'w'))
        json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                  open(Const.valid_filename, 'w'))
        utils.instances2nyt_style(
            [train_sentences_word_id, train_sentence_triples_id],
            Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style(
            [valid_sentences_word_id, valid_sentence_triples_id],
            Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id],
                  open(Const.dev_filename, 'w'))
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id],
                  open(Const.example_filename, 'w'))
def _pre_process(wiki_file_name):
    """A very simple pre-processing pass over one extracted wiki dump."""
    print("pre processing '%s'..." % wiki_file_name)
    with open(wiki_file_name) as f:
        content = f.readlines()
    # some articles have only one line, so collect everything into a list first
    # and filter the short articles afterwards
    articles = [[]]
    article_start = False
    # process line by line
    for line in content:
        line = line.strip()
        # ignore empty lines
        if not line:
            continue
        # ignore the article start marker
        if line.startswith('<doc'):
            article_start = True
            continue
        # right after the start marker comes the article title; ignore it
        if article_start:
            article_start = False
            continue
        # ignore categories
        if line.startswith('[['):
            continue
        # an article end marker starts a new (empty) article
        if line.startswith('</doc>'):
            articles.append([])
            continue
        # tokenize the article line sentence by sentence
        for sentence in sentence_tokenize(line):
            articles[-1].append(sentence)
    # keep only articles with at least 2 sentences
    articles = [d for d in articles if len(d) >= 2]
    # write everything into a single file
    with open(join(MN_CORPUS_FOLDER, MN_WIKI_FILE), 'a') as f:
        for article in articles:
            for sentence in article:
                f.write(sentence)
                f.write('\n')
            # extra newline to separate the articles
            f.write('\n')
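# Usage sketch (illustrative): the function above expects dumps with
# <doc ...> / </doc> markers where the first line after <doc ...> is the
# article title, e.g. the output of tools like WikiExtractor:
#
#   <doc id="12" url="..." title="...">
#   Article title
#   First sentence of the article. Second sentence.
#   </doc>
#
# The input path below is an assumption; MN_CORPUS_FOLDER and MN_WIKI_FILE
# are defined elsewhere in the repo.
def _demo_preprocess_wiki():
    _pre_process('extracted/AA/wiki_00')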
def statics(name):
    if name == 'train':
        filename = Const.origin_all_train_filename
    elif name == 'dev':
        filename = Const.origin_all_dev_filename
    f = open(filename, 'r', encoding='utf-8')
    content = f.readlines()
    html_doc = ' '.join(content)
    sentences_string, triples_string = utils.parse(html_doc)  # read the raw sentences and triples
    sentences_words = utils.sentence_tokenize(sentences_string)  # tokenize the sentences
    relations_words = utils.static_relations(triples_string)  # tokenize the relation names and build the relation-to-id mapping
    # count not only the words in the sentences, but also the words in the relations
    sentences_words.extend(relations_words)
    words2id = utils.static_words(sentences_words)  # build the word-to-id mapping
    relations_words_id = [None]
    for r_words in relations_words:
        r_words_id = [utils.turn_word2id(w, words2id) for w in r_words]
        relations_words_id.append(r_words_id)
    json.dump(relations_words_id,
              open(Const.relations_words_id_filename, 'w', encoding='utf-8'),
              indent=False)
def prepare(name):
    print(name)
    if name == 'train':
        filename = Const.origin_all_train_filename
    if name == 'dev':
        filename = Const.origin_all_dev_filename
    if name == 'example':
        filename = Const.origin_example_filename
    print(Const.triple_len)
    f = open(filename, 'r', encoding='utf-8')
    print(filename)
    content = f.readlines()
    html_doc = ' '.join(content)
    # cache the parsed sentences/triples so the expensive parse runs only once
    if not os.path.isfile(name + "_parse.json"):
        sentences_string, triples_string = utils.parse(html_doc)
        json.dump([sentences_string, triples_string],
                  open(name + '_parse.json', 'w', encoding='utf-8'))
    else:
        sentences_string, triples_string = json.load(
            open(name + '_parse.json', 'r', encoding='utf-8'))
    sentences_words = utils.sentence_tokenize(sentences_string)
    # record the index of the last word of each entity: (e1_end, e2_end, relation)
    position_triples = utils.find_entity_position(sentences_words, triples_string)
    # turn the sentences and the relation types into ids
    sentences_word_id, sentence_triples_id = utils.turn2id(sentences_words, position_triples)
    if name == 'train':
        # split train file into train and valid set
        [valid_sentences_word_id, valid_sentence_triples_id], \
            [train_sentences_word_id, train_sentence_triples_id] = \
            utils.split(sentences_word_id, sentence_triples_id)
        utils.static_triples_info(train_sentence_triples_id)
        utils.triples_type(train_sentence_triples_id)
        utils.static_triples_info(valid_sentence_triples_id)
        utils.triples_type(valid_sentence_triples_id)
        json.dump([train_sentences_word_id, train_sentence_triples_id],
                  open(Const.train_filename, 'w', encoding='utf-8'))
        json.dump([valid_sentences_word_id, valid_sentence_triples_id],
                  open(Const.valid_filename, 'w', encoding='utf-8'))
        utils.instances2nyt_style([train_sentences_word_id, train_sentence_triples_id],
                                  Const.nyt_style_raw_train_filename)
        utils.instances2nyt_style([valid_sentences_word_id, valid_sentence_triples_id],
                                  Const.nyt_style_raw_valid_filename)
    elif name == 'dev':
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id],
                  open(Const.dev_filename, 'w', encoding='utf-8'))
        utils.instances2nyt_style([sentences_word_id, sentence_triples_id],
                                  Const.nyt_style_raw_test_filename)
    else:
        utils.triples_type(sentence_triples_id)
        json.dump([sentences_word_id, sentence_triples_id],
                  open(Const.example_filename, 'w', encoding='utf-8'))
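# Usage sketch for the preparation pipeline above. The ordering is an
# assumption: statics() builds the word/relation id mappings that turn2id()
# relies on, so it is run once before prepare(); all file locations come
# from Const.
def _demo_prepare_all():
    statics('train')
    for split in ('train', 'dev', 'example'):
        prepare(split)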
def _process_section(section, main_class):
    content = section.get_content().decode("utf-8")
    soup = BeautifulSoup(content, 'html.parser')
    lines = []
    children = soup.find('body').findChildren()
    # keep only the elements whose CSS class matches the main text class
    for child in children:
        if child.get('class') is not None and main_class in child.get('class'):
            line = child.text.strip()
            if len(line) > 0:
                lines.append(line)
            else:
                lines.append(EMPTY_LINE)
    if len(lines) >= 2:
        lines = _unwrap_lines(lines)
        lines = [_process_line(line) for line in lines]
    lines.append(EMPTY_LINE)
    sentences = []
    for line in lines:
        sentences += sentence_tokenize(line)
    return sentences
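# Self-contained sketch of the class-based paragraph selection used above, on a
# made-up XHTML snippet (only bs4 is required; 'para' stands in for the book's
# main text class).
from bs4 import BeautifulSoup

def _demo_select_main_class():
    html = ('<html><body>'
            '<p class="para">First paragraph of the section.</p>'
            '<p class="note">A footnote that should be skipped.</p>'
            '<p class="para">Second paragraph.</p>'
            '</body></html>')
    soup = BeautifulSoup(html, 'html.parser')
    lines = [child.text.strip()
             for child in soup.find('body').findChildren()
             if child.get('class') is not None and 'para' in child.get('class')]
    return lines  # ['First paragraph of the section.', 'Second paragraph.']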
def load_data(self, unlabeled_data, ids):
    self.message = {}
    self.ids = []
    self.data_num = 0
    for i in ids:
        try:
            sentences = []
            labels = []
            doc = unlabeled_data[i]
            doc_len = []
            sent_len = []
            doc += '.'
            # mask URLs (http links, www links and bare domains) with a <website> token
            results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", doc)
            results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
            dd = results.sub(" <website> ", dd)
            results = re.compile(
                r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
            dd = results.sub(" <website> ", dd)
            sents = sentence_tokenize(dd)
            for j in range(0, len(sents)):
                a = regexp_tokenize(transform_format(sents[j]), self.pattern)
                temp = []
                # normalize tokens: drop punctuation, map digits, amounts and
                # percentages to special tokens, and lower-case everything
                for k in range(0, len(a)):
                    if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                        if a[k].isdigit():
                            a[k] = '<number>'
                        elif a[k][0] == '$':
                            a[k] = '<money>'
                        elif a[k][-1] == '%':
                            a[k] = '<percentage>'
                        temp.append(a[k].lower())
                if len(temp) > 0:
                    # wrap the (truncated) sentence with <sos> ... <eos>
                    temp_ = ['<sos>']
                    for k in range(0, min(len(temp), self.max_seq_len - 2)):
                        temp_.append(temp[k])
                    temp_.append('<eos>')
                    sentences.append(temp_)
                    labels.append(10)  # dummy label for unlabeled data
                    sent_len.append(len(temp_) - 1)
            doc_len.append(min(len(sents) - 1, self.max_seq_num - 1))
            self.message[i] = (sentences[:self.max_seq_num],
                               labels[:self.max_seq_num],
                               sent_len[:self.max_seq_num], doc_len)
            self.ids.append(i)
        except:
            # skip documents that cannot be processed
            pass
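# Self-contained sketch of the token normalization above: digits become
# <number>, dollar amounts <money>, percentages <percentage>, and everything
# is lower-cased. A plain str.split() stands in for regexp_tokenize /
# transform_format, which live elsewhere in the repo.
def _demo_normalize_tokens(sentence):
    normalized = []
    for tok in sentence.split():
        if tok.isdigit():
            tok = '<number>'
        elif tok[0] == '$':
            tok = '<money>'
        elif tok[-1] == '%':
            tok = '<percentage>'
        normalized.append(tok.lower())
    return normalized

# _demo_normalize_tokens("The fund grew 12 % to $5000 in 2019")
# -> ['the', 'fund', 'grew', '<number>', '<percentage>', 'to', '<money>', 'in', '<number>']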
def build_vocab(self, unlabeled_data, labeled_data, embedding_size, max_seq_num, max_seq_len):
    sentences = []
    words = []
    if unlabeled_data is not None:
        for (u, v) in unlabeled_data.items():
            try:
                # mask URLs with a <website> token
                results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", v)
                results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                dd = results.sub(" <website> ", dd)
                results = re.compile(
                    r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
                dd = results.sub(" <website> ", dd)
                sents = sentence_tokenize(dd)
                for j in range(0, len(sents)):
                    a = regexp_tokenize(transform_format(sents[j]), self.pattern)
                    temp = []
                    for k in range(0, len(a)):
                        if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                            if a[k].isdigit():
                                a[k] = '<number>'
                            elif a[k][0] == '$':
                                a[k] = '<money>'
                            elif a[k][-1] == '%':
                                a[k] = '<percentage>'
                            temp.append(a[k].lower())
                            words.append(a[k].lower())
                    if len(temp) > 0:
                        sentences.append(temp)
            except:
                # skip documents that cannot be processed
                pass
    if labeled_data is not None:
        for (u, v) in labeled_data.items():
            for i in range(0, len(v[0])):
                v[0][i] = str(v[0][i])
                try:
                    results = re.compile(r'http[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", v[0][i])
                    results = re.compile(r'www.[a-zA-Z0-9.?/&=:#%_-]*', re.S)
                    dd = results.sub(" <website> ", dd)
                    results = re.compile(
                        r'[a-zA-Z0-9.?/&=:#%_-]*.(com|net|org|io|gov|me|edu)', re.S)
                    dd = results.sub(" <website> ", dd)
                except:
                    print(u, v)
                    print(v[0][i])
                    exit()
                a = regexp_tokenize(transform_format(dd), self.pattern)
                temp = []
                for k in range(0, len(a)):
                    if a[k] not in self.english_punctuations and check_ack_word(a[k]) == 1:
                        if a[k].isdigit():
                            a[k] = '<number>'
                        elif a[k][0] == '$':
                            a[k] = '<money>'
                        elif a[k][-1] == '%':
                            a[k] = '<percentage>'
                        temp.append(a[k].lower())
                        words.append(a[k].lower())
                if len(temp) > 0:
                    sentences.append(temp)
    # word frequencies over the whole corpus
    word_frequency = {}
    for i in range(0, len(words)):
        if words[i] in word_frequency:
            word_frequency[words[i]] += 1
        else:
            word_frequency[words[i]] = 1
    # train word2vec embeddings on the tokenized sentences
    self.model = gensim.models.Word2Vec(sentences, size=embedding_size,
                                        window=5, min_count=1, iter=20,
                                        negative=50)
    # ids 0-3 are reserved for the special tokens; regular words start at 4
    x = 4
    self.word2id['<pad>'] = 0
    self.id2word[0] = '<pad>'
    self.word2id['<sos>'] = 2
    self.id2word[2] = '<sos>'
    self.word2id['<eos>'] = 3
    self.id2word[3] = '<eos>'
    self.unk_count = 0
    for i in range(0, len(sentences)):
        for j in range(0, len(sentences[i])):
            # words that occur at least twice and have an embedding get their own
            # id; everything else is mapped to <unk> (id 1)
            if word_frequency[sentences[i][j].lower()] >= 2:
                if sentences[i][j].lower() in self.model:
                    if sentences[i][j].lower() in self.word2id:
                        pass
                    else:
                        self.word2id[sentences[i][j].lower()] = x
                        self.id2word[x] = sentences[i][j].lower()
                        x = x + 1
            else:
                self.word2id['<unk>'] = 1
                self.id2word[1] = '<unk>'
                self.unk_count += 1
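# Self-contained sketch of the id assignment scheme above: ids 0-3 are reserved
# for <pad>, <unk>, <sos>, <eos>; words that occur at least twice get the next
# free id, everything else maps to <unk>. The Word2Vec membership test is left
# out to keep the sketch dependency-free.
from collections import Counter

def _demo_build_word2id(sentences, min_freq=2):
    freq = Counter(tok for sent in sentences for tok in sent)
    word2id = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for sent in sentences:
        for tok in sent:
            if freq[tok] >= min_freq and tok not in word2id:
                word2id[tok] = len(word2id)
    return word2id

# _demo_build_word2id([['the', 'cat'], ['the', 'dog']])
# -> {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'the': 4}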