def parse_wiki_text_cropped(string):
    """Parse wiki text and return cropped paragraphs (paragraphs of more than 2 sentences)."""
    if string is None:
        return "", "", False  # keep the three-element return shape for empty input
    string, is_malformed = first_pass(string)
    if is_malformed:
        return "", "", True
    string = second_pass(string)
    string = string.strip()
    result = []
    cropped = []
    for segment in string.split("\n\n"):
        if len(segment) > 10000:  # overly long segments are treated as malformed input
            is_malformed = True
            break
        add_segment = utils.split_sentences(segment)
        result += add_segment
        result.append("")  # blank entry marks a paragraph boundary
        if len(add_segment) > 2:
            cropped += add_segment
            cropped.append("")
    if cropped and cropped[-1] == "":
        cropped.pop()
    if result and result[-1] == "":
        result.pop()
    return result, cropped, is_malformed

def parse_wiki_text(string, get_cropped=False):  # get_cropped is accepted but unused here
    """Parse wiki text."""
    if string is None:
        return "", False  # keep the two-element return shape for empty input
    string, is_malformed = first_pass(string)
    if is_malformed:
        return "", True
    string = second_pass(string)
    string = string.strip()
    result = []
    for segment in string.split("\n\n"):
        if len(segment) > 10000:  # overly long segments are treated as malformed input
            is_malformed = True
            break
        result += utils.split_sentences(segment)
        result.append("")  # blank entry marks a paragraph boundary
    if result and result[-1] == "":
        result.pop()
    return result, is_malformed

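# A minimal usage sketch (not from the original source). It assumes first_pass,
# second_pass, and utils.split_sentences are available in this module and behave as
# the two parsers above expect; the sample text is purely illustrative.
sample = "First sentence. Second sentence. Third sentence.\n\nA short paragraph."
sentences, long_paragraphs, malformed = parse_wiki_text_cropped(sample)
if not malformed:
    # `sentences` holds every sentence with "" marking paragraph breaks;
    # `long_paragraphs` keeps only paragraphs that split into more than 2 sentences.
    print(len(sentences), len(long_paragraphs))
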
def __init__(self, content, source):
    self.content = content
    self.source = source
    self.sentences = split_sentences(content)
    self.vector = vectorize_text(content)
    self.word_scores = {}
    self.summary = summarize(self.content, 7)
    self.init_word_scores()

def _extract(self, text):
    def extract_from_sentence(sentence):
        return filter(lambda w: w in self.ranked_words, jieba.lcut(sentence))

    keywords = set()
    for sentence in split_sentences(text):
        keywords.update(extract_from_sentence(sentence))
    return keywords

def plan(self, text):
    def extract(sentence):
        return [x for x in jieba.lcut(sentence) if x in self.ranks]

    keywords = sorted(reduce(lambda x, y: x + y,
                             map(extract, split_sentences(text)), []),
                      key=lambda x: self.ranks[x])
    # Drop adjacent duplicates from the sorted keyword list.
    words = [keywords[i] for i in range(len(keywords))
             if i == 0 or keywords[i] != keywords[i - 1]]
    if len(words) < 2:
        self.expand(words, 2)
    else:
        while len(words) > 2:
            words.pop()
    return words

def plan(self, text):
    def extract(sentence):
        return list(filter(lambda x: x in self.ranks, jieba.lcut(sentence)))

    # reduce and cmp_to_key require `from functools import reduce, cmp_to_key` in Python 3.
    txts = reduce(lambda x, y: x + y, map(extract, split_sentences(text)), [])
    keywords = sorted(txts, key=cmp_to_key(self.cmp))
    # Drop adjacent duplicates from the sorted keyword list.
    words = [keywords[i] for i in range(len(keywords))
             if i == 0 or keywords[i] != keywords[i - 1]]
    if len(words) < 4:
        self.expand(words, 4)
    else:
        while len(words) > 4:
            words.pop()
    return words

def plan(self, text):
    def extract(sentence):
        return list(filter(lambda x: x in self.ranks, jieba.lcut(sentence)))

    def get_ranks(line):
        return self.ranks[line]

    A = reduce(lambda x, y: x + y, map(extract, split_sentences(text)), [])
    keywords = sorted(A, key=get_ranks, reverse=True)
    # Drop adjacent duplicates from the sorted keyword list.
    words = [keywords[i] for i in range(len(keywords))
             if i == 0 or keywords[i] != keywords[i - 1]]
    if len(words) < 4:
        self.expand(words, 4)
    else:
        while len(words) > 4:
            words.pop()  # remove the last element
    return words

def plan(self, text):
    def extract(sentence):
        # Materialise the filtered tokens so list concatenation below works on Python 3.
        return [x for x in jieba.lcut(sentence) if x in self.ranks]

    # sorted(..., cmp=...) is Python 2 only; an equivalent key function is used here.
    keywords = sorted(reduce(lambda x, y: x + y,
                             map(extract, split_sentences(text)), []),
                      key=lambda x: self.ranks[x])
    words = [keywords[i] for i in range(len(keywords))
             if i == 0 or keywords[i] != keywords[i - 1]]
    if len(words) < 4:
        self.expand(words, 4)
    else:
        while len(words) > 4:
            words.pop()
    return words

def plan(self, text):
    # Translate each sentence, dropping the ones the translator rejects.
    # (The original popped elements while indexing over the same list, which can
    # raise IndexError; building a new list avoids that.)
    possible_result = []
    for sentence in split_sentences(text):
        target = self.translater.translate(sentence)
        if target != "BAD TAG !":
            possible_result.append(target)
    keywords = sorted(possible_result, key=lambda x: self.ranks[x])
    words = [keywords[i] for i in range(len(keywords))
             if i == 0 or keywords[i] != keywords[i - 1]]
    if len(words) < 2:
        self.expand(words, 2)
    else:
        while len(words) > 2:
            words.pop()
    return words

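# A minimal usage sketch (not from the original source): `KeywordPlanner` is a
# hypothetical name for a class exposing one of the plan() variants above, with
# self.ranks mapping candidate words to importance scores and self.expand() padding
# keyword lists that are too short.
planner = KeywordPlanner()
keywords = planner.plan(u"春眠不觉晓,处处闻啼鸟。")
print(keywords)  # a fixed number (2 or 4, depending on the variant) of deduplicated keywords
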
def __init__(self):
    self.poems = list()
    # Regenerate the parsed poems if any raw corpus or the output file is stale.
    for corpus_name in _corpus_list:
        corpuspath = os.path.join(raw_dir, corpus_name)
        if not check_uptodate(corpuspath):
            _gen_poems()
    if not check_uptodate(poems_path):
        _gen_poems()
    for corpus_name in _corpus_list:
        corpuspath = os.path.join(raw_dir, corpus_name)
        with open(corpuspath, 'r') as fr:
            for line in fr:
                sentences = split_sentences(line.strip('\r\n ').split()[-1])
                self.poems.append(sentences)
    print('self.poems==>', len(self.poems))

def _gen_poems():
    print("Parsing poems ...")
    char_dict = CharDict()
    with open(poems_path, 'w') as fout:
        for corpus in _corpus_list:
            with open(os.path.join(raw_dir, corpus), 'r') as fin:
                for line in fin.readlines()[1:]:  # skip the header line
                    sentences = split_sentences(line.strip().split()[-1])
                    # Keep a poem only if every character is in the dictionary.
                    all_char_in_dict = True
                    for sentence in sentences:
                        for ch in sentence:
                            if char_dict.char2int(ch) < 0:
                                all_char_in_dict = False
                                break
                        if not all_char_in_dict:
                            break
                    if all_char_in_dict:
                        fout.write(' '.join(sentences) + '\n')
            print("Finished parsing %s." % corpus)

def _parse_corpus(raw_file, json_file):
    print("Parsing %s ..." % raw_file, end=' ')
    # sys.stdout.flush()  # useful on Linux to show progress immediately
    rdict = RhymeUtil()
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        tags = fin.readline().strip().split('\t')
        line = fin.readline().strip()
        while line:
            toks = line.split('\t')
            poem = {'source': os.path.basename(raw_file)}
            for idx, tok in enumerate(toks):
                if tags[idx] != 'body':
                    poem[tags[idx]] = tok
                else:
                    body = tok
                    # Strip parenthesised annotations; discard the poem if the
                    # parentheses are unbalanced.
                    flag = True
                    left = body.find('(')
                    while left >= 0:
                        right = body.find(')')
                        if right < left:
                            flag = False
                            break
                        else:
                            body = body[:left] + body[right + 1:]
                            left = body.find('(')
                    if flag and body.find(')') < 0:
                        poem['sentences'] = split_sentences(body)
                        # Keep only poems whose every character has rhyme information.
                        for sentence in poem['sentences']:
                            if not reduce(lambda x, ch: x and rdict.has_char(ch),
                                          sentence, True):
                                flag = False
                                break
                        if flag:
                            data.append(poem)
            line = fin.readline().strip()
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    print("Done (%d poems)" % len(data))
    return data

def _gen_poems():
    print("Parsing poems ...")
    word_dict = wordDict()
    with open(poems_path, 'w', encoding='utf-8') as fout:
        for corpus in _corpus_list:
            with open(os.path.join(raw_dir, corpus), 'r', encoding='utf-8') as fin:
                for line in fin.readlines():
                    sentences = split_sentences(line)
                    # Skip blank lines and poems whose first sentence is not 3 words long.
                    if not sentences or len(sentences[0].split()) != 3:
                        continue
                    # Keep a poem only if every word is in the dictionary.
                    all_word_in_dict = True
                    for sentence in sentences:
                        sentence = sentence.strip().split()
                        for word in sentence:
                            if word_dict.word2int(word) < 0:
                                all_word_in_dict = False
                                break
                        if not all_word_in_dict:
                            break
                    if all_word_in_dict:
                        fout.write('|'.join(sentences) + '\n')
            print("Finished parsing %s." % corpus)

def _gen_poems():
    print("Parsing poems ...")
    chardict = CharDict()
    corpus = list()
    for corpus_name in _corpus_list:
        corpuspath = os.path.join(raw_dir, corpus_name)
        with open(corpuspath, 'r') as fr:
            for index, line in enumerate(fr):
                if index == 0:  # skip the header line
                    continue
                all_in_char = True
                sentences = split_sentences(line.split()[3])
                for sentence in sentences:
                    for char in sentence:
                        if chardict[char] < 0:
                            all_in_char = False
                            # raise ValueError('char\t{}\t is not in char_dict?'.format(char))
                if all_in_char:
                    corpus.append(sentences)
    # Sort poems by descending sentence length, then by descending number of sentences.
    corpus_sorted = sorted(corpus, key=lambda x: (-len(x[0]), -len(x)))
    with open(poems_path, 'w') as fw:
        for sentences in corpus_sorted:
            fw.write(' '.join(sentences) + '\n')
    print("Finished parsing %s." % corpus_name)

def segment(self, sentence):
    # TODO: try CRF-based segmentation.
    toks = []
    idx = 0
    while idx + 4 <= len(sentence):
        # Cut 2 chars each time.
        if sentence[idx:idx + 2] in self.sxhy_dict:
            toks.append(sentence[idx:idx + 2])
        else:
            for tok in jieba.lcut(sentence[idx:idx + 2]):
                toks.append(tok)
        idx += 2
    # Cut the last 3 chars (or fewer) in one piece.
    if idx < len(sentence):
        if sentence[idx:] in self.sxhy_dict:
            toks.append(sentence[idx:])
        else:
            for tok in jieba.lcut(sentence[idx:]):
                toks.append(tok)
    return toks


# For testing purpose.
if __name__ == '__main__':
    segmenter = Segmenter()
    with open(os.path.join(raw_dir, 'qts_tab.txt'), 'r') as fin:
        for line in fin.readlines()[1:6]:
            for sentence in split_sentences(line.strip().split()[3]):
                print(' '.join(segmenter.segment(sentence)))

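# A minimal sketch (not from the original source) of the split_sentences helper that
# the poem-processing code above relies on: it assumes sentences are delimited by the
# usual CJK punctuation and that the delimiters are dropped. The real helper may differ.
import re

def split_sentences(text):
    parts = re.split(u'[,。!?;、]', text)
    return [part.strip() for part in parts if part.strip()]
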
def extract_relation_each_file(source_filepath, ere_filepath, annotation_filepath,
                               part_name, with_none):
    relation_records_each_file = []
    source_fp = open(source_filepath)
    all_source_text = source_fp.read().decode("utf-8")  # mind the file encoding (Python 2 idiom)
    source_fp.close()
    sentences = split_sentences(all_source_text)  # split the source text into sentences

    ere_file = xml.dom.minidom.parse(ere_filepath)
    ere_root = ere_file.documentElement
    relation_list = ere_root.getElementsByTagName('relation')
    entity_list = ere_root.getElementsByTagName('entity')
    filler_list = ere_root.getElementsByTagName('filler')

    annotation_file = xml.dom.minidom.parse(annotation_filepath)
    annotation_root = annotation_file.documentElement
    annotation_belief_list = annotation_root.getElementsByTagName('belief_annotations')
    annotation_belief_list = annotation_belief_list[0]
    # These are actually relation_mention-level annotations.
    annotation_relation_list = annotation_belief_list.getElementsByTagName('relation')

    for i in range(len(relation_list)):
        # Relation-level information.
        relation_id = relation_list[i].getAttribute('id')
        relation_type = relation_list[i].getAttribute('type')
        relation_subtype = relation_list[i].getAttribute('subtype')
        relation_mention_list = relation_list[i].getElementsByTagName('relation_mention')
        for j in range(len(relation_mention_list)):
            # Relation-mention information.
            relation_mention_id = relation_mention_list[j].getAttribute('id')
            relation_mention_realis = relation_mention_list[j].getAttribute('realis')
            # Belief (polarity) label; assumes every relation mention has a
            # matching belief annotation.
            for k in range(len(annotation_relation_list)):
                annotation_relation_id = annotation_relation_list[k].getAttribute('ere_id')
                if annotation_relation_id == relation_mention_id:
                    be_em = annotation_relation_list[k].getElementsByTagName('belief')
                    label_type = be_em[0].getAttribute('type')
                    break
            if with_none is False and label_type == 'na':
                break  # drop the sample when the label is na

            # rel_arg1 is an entity: basic attributes.
            rel_arg1 = relation_mention_list[j].getElementsByTagName('rel_arg1')[0]
            rel_arg1_id = rel_arg1.getAttribute('entity_id')
            rel_arg1_mention_id = rel_arg1.getAttribute('entity_mention_id')
            rel_arg1_role = rel_arg1.getAttribute('role')
            rel_arg1_text = rel_arg1.firstChild.data
            # Information on the owning entity and entity mention.
            rel_arg1_entity_type, rel_arg1_entity_specificity, rel_arg1_mention_noun_type, \
                rel_arg1_mention_offset, rel_arg1_mention_length, rel_arg1_context, \
                rel_arg1_window_text = rel_arg_entity_info(
                    entity_list, rel_arg1_id, rel_arg1_mention_id, rel_arg1_text, sentences)

            # rel_arg2, handled like rel_arg1.
            rel_arg2 = relation_mention_list[j].getElementsByTagName('rel_arg2')[0]
            rel_arg2_role = rel_arg2.getAttribute('role')
            rel_arg2_text = rel_arg2.firstChild.data
            rel_arg2_id = rel_arg2.getAttribute('entity_id')
            if rel_arg2_id != '':
                rel_arg2_mention_id = rel_arg2.getAttribute('entity_mention_id')
                # Information on the owning entity and entity mention.
                rel_arg2_entity_type, rel_arg2_entity_specificity, rel_arg2_mention_noun_type, \
                    rel_arg2_mention_offset, rel_arg2_mention_length, rel_arg2_context, \
                    rel_arg2_window_text = rel_arg_entity_info(
                        entity_list, rel_arg2_id, rel_arg2_mention_id, rel_arg2_text, sentences)
                rel_arg2_is_filler = 0
            else:
                # Some rel_arg2 elements are fillers rather than entities;
                # handle them in a simplified way for now.
                rel_arg2_is_filler = 1
                rel_arg2_id = rel_arg2.getAttribute('filler_id')
                rel_arg2_entity_type, rel_arg2_mention_offset, rel_arg2_mention_length, \
                    rel_arg2_context, rel_arg2_window_text = rel_arg_filler_info(
                        filler_list, rel_arg2_id, rel_arg2_text, sentences)
                rel_arg2_mention_id = ''
                rel_arg2_entity_specificity = ''
                rel_arg2_mention_noun_type = ''

            # Trigger (may be absent; to be verified).
            trigger = relation_mention_list[j].getElementsByTagName('trigger')
            if len(trigger) == 0:
                trigger_offset = 0
                trigger_length = 0
                trigger_text = ""
                trigger_context = ""
            else:
                trigger = trigger[0]
                trigger_offset = int(trigger.getAttribute('offset'))
                trigger_length = int(trigger.getAttribute('length'))
                trigger_text = trigger.firstChild.data
                # Context information. Tunable: relation triggers are often short
                # words such as "at", so only the current sentence is extracted here.
                above = 0
                below = 0
                trigger_context_dict = find_context(trigger_offset, sentences,
                                                    trigger_text, above, below)
                # Join the context into a single string.
                trigger_context = context_dict_to_string(trigger_context_dict, above, below)

            # Actual source of the belief annotation.
            source = be_em[0].getElementsByTagName('source')
            if label_type == 'na' or len(source) == 0:
                source_id = ''
                source_offset = 0
                source_length = 0
                source_text = ''
            else:
                source = source[0]
                source_id = source.getAttribute('ere_id')
                source_offset = int(source.getAttribute('offset'))
                source_length = int(source.getAttribute('length'))
                source_text = source.firstChild.data

            relation_record = {
                'file': part_name,
                'relation_id': relation_id,
                'relation_type': relation_type,
                'relation_subtype': relation_subtype,
                'relation_mention_id': relation_mention_id,
                'relation_mention_realis': relation_mention_realis,
                'rel_arg1_id': rel_arg1_id,
                'rel_arg1_mention_id': rel_arg1_mention_id,
                'rel_arg1_role': rel_arg1_role,
                'rel_arg1_text': rel_arg1_text,
                'rel_arg1_entity_type': rel_arg1_entity_type,
                'rel_arg1_entity_specificity': rel_arg1_entity_specificity,
                'rel_arg1_mention_noun_type': rel_arg1_mention_noun_type,
                'rel_arg1_mention_offset': rel_arg1_mention_offset,
                'rel_arg1_mention_length': rel_arg1_mention_length,
                'rel_arg1_context': rel_arg1_context,
                'rel_arg1_window_text': rel_arg1_window_text,
                'rel_arg2_id': rel_arg2_id,
                'rel_arg2_mention_id': rel_arg2_mention_id,
                'rel_arg2_role': rel_arg2_role,
                'rel_arg2_text': rel_arg2_text,
                'rel_arg2_entity_type': rel_arg2_entity_type,
                'rel_arg2_entity_specificity': rel_arg2_entity_specificity,
                'rel_arg2_mention_noun_type': rel_arg2_mention_noun_type,
                'rel_arg2_mention_offset': rel_arg2_mention_offset,
                'rel_arg2_mention_length': rel_arg2_mention_length,
                'rel_arg2_context': rel_arg2_context,
                'rel_arg2_is_filler': rel_arg2_is_filler,
                'rel_arg2_window_text': rel_arg2_window_text,
                'trigger_offset': trigger_offset,
                'trigger_length': trigger_length,
                'trigger_text': trigger_text,
                'trigger_context': trigger_context,
                'label_type': label_type,
                'source_id': source_id,
                'source_offset': source_offset,
                'source_length': source_length,
                'source_text': source_text
            }
            relation_records_each_file.append(relation_record)
    return relation_records_each_file

def extract_event_each_file(source_filepath, ere_filepath, annotation_filepath,
                            part_name, with_none):
    event_records_each_file = []
    em_args_each_file = []
    source_fp = open(source_filepath)
    all_source_text = source_fp.read().decode("utf-8")  # mind the file encoding (Python 2 idiom)
    source_fp.close()
    sentences = split_sentences(all_source_text)  # split the source text into sentences

    ere_file = xml.dom.minidom.parse(ere_filepath)
    ere_root = ere_file.documentElement
    hopper_list = ere_root.getElementsByTagName('hopper')
    entity_list = ere_root.getElementsByTagName('entity')
    filler_list = ere_root.getElementsByTagName('filler')

    annotation_file = xml.dom.minidom.parse(annotation_filepath)
    annotation_root = annotation_file.documentElement
    annotation_belief_list = annotation_root.getElementsByTagName('belief_annotations')
    annotation_belief_list = annotation_belief_list[0]
    annotation_event_list = annotation_belief_list.getElementsByTagName('event')

    for i in range(len(hopper_list)):
        # Hopper-level information.
        hopper_id = hopper_list[i].getAttribute('id')
        event_mention_list = hopper_list[i].getElementsByTagName('event_mention')
        for j in range(len(event_mention_list)):
            # Event-mention information.
            event_mention_id = event_mention_list[j].getAttribute('id')
            event_mention_type = event_mention_list[j].getAttribute('type')
            event_mention_subtype = event_mention_list[j].getAttribute('subtype')
            event_mention_realis = event_mention_list[j].getAttribute('realis')
            event_mention_ways = event_mention_list[j].getAttribute('ways')
            # Belief (polarity) label; assumes every event mention has a
            # matching belief annotation.
            for k in range(len(annotation_event_list)):
                annotation_event_id = annotation_event_list[k].getAttribute('ere_id')
                if annotation_event_id == event_mention_id:
                    be_em = annotation_event_list[k].getElementsByTagName('belief')
                    label_type = be_em[0].getAttribute('type')
                    break
            if with_none is False and label_type == 'none':
                break  # drop the sample when the label is none

            # Trigger.
            trigger = event_mention_list[j].getElementsByTagName('trigger')
            trigger = trigger[0]
            trigger_offset = int(trigger.getAttribute('offset'))
            trigger_length = int(trigger.getAttribute('length'))
            trigger_text = trigger.firstChild.data
            # Context information (tunable: 3 sentences above and below).
            above = 3
            below = 3
            trigger_context_dict = find_context(trigger_offset, sentences,
                                                trigger_text, above, below)
            # Join the context into a single string.
            trigger_context = context_dict_to_string(trigger_context_dict, above, below)
            # Extract a window of words around the trigger from its own sentence.
            window_length = 10
            sen = trigger_context_dict[0]['text']
            sen_offset = trigger_context_dict[0]['offset']
            window_text = get_window_text(window_length, sen, sen_offset,
                                          trigger_text, trigger_offset)

            # Event arguments (usually no more than 4 per mention).
            em_args = event_mention_list[j].getElementsByTagName('em_arg')
            em_arg_num = len(em_args)
            for em_arg in em_args:
                em_arg_role = em_arg.getAttribute('role')
                em_arg_text = em_arg.firstChild.data
                em_arg_id = em_arg.getAttribute('entity_id')
                if em_arg_id != "":
                    # The argument is an entity.
                    em_arg_mention_id = em_arg.getAttribute('entity_mention_id')
                    # Information on the owning entity and entity mention.
                    em_arg_entity_type, em_arg_entity_specificity, em_arg_mention_noun_type, \
                        em_arg_mention_offset, em_arg_mention_length, em_arg_context = \
                        em_arg_entity_info(entity_list, em_arg_id, em_arg_mention_id,
                                           em_arg_text, sentences)
                    em_arg_is_filler = 0
                else:
                    # The argument is a filler rather than an entity.
                    em_arg_id = em_arg.getAttribute('filler_id')
                    em_arg_entity_type, em_arg_mention_offset, em_arg_mention_length, \
                        em_arg_context = em_arg_filler_info(filler_list, em_arg_id,
                                                            em_arg_text, sentences)
                    em_arg_mention_id = ""
                    em_arg_entity_specificity = ""
                    em_arg_mention_noun_type = ""
                    em_arg_is_filler = 1
                em_arg_record = {
                    'file': part_name,
                    'hopper_id': hopper_id,
                    'event_mention_id': event_mention_id,
                    'em_arg_id': em_arg_id,
                    'em_arg_mention_id': em_arg_mention_id,
                    'em_arg_role': em_arg_role,
                    'em_arg_text': em_arg_text,
                    'em_arg_entity_type': em_arg_entity_type,
                    'em_arg_entity_specificity': em_arg_entity_specificity,
                    'em_arg_mention_noun_type': em_arg_mention_noun_type,
                    'em_arg_mention_offset': em_arg_mention_offset,
                    'em_arg_mention_length': em_arg_mention_length,
                    'em_arg_context': em_arg_context,
                    'em_arg_is_filler': em_arg_is_filler
                }
                em_args_each_file.append(em_arg_record)

            # Actual source of the belief annotation.
            source = be_em[0].getElementsByTagName('source')
            if label_type == 'na' or len(source) == 0:
                source_id = ''
                source_offset = 0
                source_length = 0
                source_text = ''
            else:
                source = source[0]
                source_id = source.getAttribute('ere_id')
                source_offset = int(source.getAttribute('offset'))
                source_length = int(source.getAttribute('length'))
                source_text = source.firstChild.data

            event_record = {
                'file': part_name,
                'hopper_id': hopper_id,
                'event_mention_id': event_mention_id,
                'event_mention_type': event_mention_type,
                'event_mention_subtype': event_mention_subtype,
                'event_mention_realis': event_mention_realis,
                'event_mention_ways': event_mention_ways,
                'trigger_offset': trigger_offset,
                'trigger_length': trigger_length,
                'trigger_text': trigger_text,
                'trigger_context': trigger_context,
                'trigger_window_text': window_text,
                'em_arg_num': em_arg_num,
                'label_type': label_type,
                'source_id': source_id,
                'source_offset': source_offset,
                'source_length': source_length,
                'source_text': source_text
            }
            event_records_each_file.append(event_record)
    return event_records_each_file, em_args_each_file

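# A minimal usage sketch (not from the original source): the directory layout and file
# names below are purely illustrative. It assumes one raw source file, one rich ERE
# file, and one belief-annotation file per document, as the extractors above expect.
relation_records = extract_relation_each_file(
    'data/source/doc1.txt', 'data/ere/doc1.rich_ere.xml',
    'data/annotation/doc1.best.xml', 'doc1', with_none=False)
event_records, em_args = extract_event_each_file(
    'data/source/doc1.txt', 'data/ere/doc1.rich_ere.xml',
    'data/annotation/doc1.best.xml', 'doc1', with_none=False)
print(len(relation_records), len(event_records), len(em_args))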