Example #1
def parse_wiki_text_cropped(string):
    """Parse wiki text and return cropped paragraphs (paras of length > 2)."""
    if string is None:
        return ""

    is_malformed = False
    string, is_malformed = first_pass(string)

    if is_malformed:
        return "", "", True

    string = second_pass(string)
    string = string.strip()
    result = []
    cropped = []

    for segment in string.split("\n\n"):
        if len(segment) > 10000:
            is_malformed = True
            break
        add_segment = utils.split_sentences(segment)
        result += add_segment
        result.append("")

        if len(add_segment) > 2:
            cropped += add_segment
            cropped.append("")

    if len(cropped) > 0 and cropped[-1] == "":
        cropped.pop()
    if len(result) > 0 and result[-1] == "":
        result.pop()

    return result, cropped, is_malformed
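
The wiki parsers in Examples #1 and #2 return a flat sentence list in which an empty string marks each paragraph boundary, plus a malformed flag (and a cropped variant here). A minimal sketch, assuming the parser and its helpers (first_pass, second_pass, utils.split_sentences) come from the surrounding module, of how a caller might regroup that flat list into paragraphs:

def paragraphs_from_parsed(sentences):
    """Regroup a flat sentence list (paragraphs separated by "") into lists."""
    paragraphs, current = [], []
    for sentence in sentences:
        if sentence == "":
            if current:
                paragraphs.append(current)
            current = []
        else:
            current.append(sentence)
    if current:
        paragraphs.append(current)
    return paragraphs

# Hypothetical usage; raw_text is any wiki-markup string.
# result, cropped, is_malformed = parse_wiki_text_cropped(raw_text)
# if not is_malformed:
#     for para in paragraphs_from_parsed(cropped):
#         print(" ".join(para))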
Example #2
def parse_wiki_text(string, get_cropped=False):
    """Parse wiki text."""
    if string is None:
        return ""
    
    is_malformed = False
    string, is_malformed = first_pass(string)
    
    if is_malformed:
        return "", True
    
    string = second_pass(string)
    string = string.strip()
    result = []

    for segment in string.split("\n\n"):
        if len(segment) > 10000:
            is_malformed = True
            break
        result += utils.split_sentences(segment)
        result.append("")
    
    if len(result) > 0 and result[-1] == "":
        result.pop()

    return result, is_malformed
Example #3
	def __init__(self, content, source):
		self.content = content
		self.source = source
		self.sentences = split_sentences(content)
		self.vector = vectorize_text(content)
		self.word_scores = {}
		self.summary = summarize(self.content, 7)
		self.init_word_scores()
Example #4
    def _extract(self, text):
        def extract_from_sentence(sentence):
            return filter(lambda w: w in self.ranked_words,
                          jieba.lcut(sentence))

        keywords = set()
        for sentence in split_sentences(text):
            keywords.update(extract_from_sentence(sentence))
        return keywords
Example #5
 def plan(self, text):
     def extract(sentence):
         return [x for x in jieba.lcut(sentence) if x in self.ranks]
     keywords = sorted(reduce(lambda x,y:x+y, list(map(extract, split_sentences(text))), []),
         key = lambda x : self.ranks[x])
     words = [keywords[idx] for idx in \
             [i for i in range(len(keywords)) if 0 == i or keywords[i] != keywords[i-1]]]
     if len(words) < 2:
         self.expand(words, 2)
     else:
         while len(words) > 2:
             words.pop()
     return words
Example #6
    def plan(self, text):
        def extract(sentence):
            return list(filter(lambda x: x in self.ranks,
                               jieba.lcut(sentence)))

        txts = reduce(lambda x, y: x + y, map(extract, split_sentences(text)))
        keywords = sorted(txts, key=cmp_to_key(self.cmp))
        words = [keywords[idx] for idx in \
                filter(lambda i: 0 == i or keywords[i] != keywords[i-1], range(len(keywords)))]
        if len(words) < 4:
            self.expand(words, 4)
        else:
            while len(words) > 4:
                words.pop()
        return words
Example #7
 def plan(self, text):
     def extract(sentence):
         return list(filter(lambda x: x in self.ranks, jieba.lcut(sentence)))
     def get_ranks(line):
         return self.ranks[line]
     A = reduce(lambda x, y: x + y, map(extract, split_sentences(text)), [])
     keywords = sorted(A, key=get_ranks, reverse=True)
     words = [keywords[idx] for idx in \
             list(filter(lambda i: 0 == i or keywords[i] != keywords[i-1], range(len(keywords))))]
     if len(words) < 4:
         self.expand(words, 4)
     else:
         while len(words) > 4:
             words.pop()  # remove the last element
     return words
Example #8
    def plan(self, text):
        def extract(sentence):
            return [x for x in jieba.lcut(sentence) if x in self.ranks]

        keywords = sorted(reduce(lambda x, y: x + y,
                                 map(extract, split_sentences(text)), []),
                          key=lambda x: self.ranks[x])
        words = [keywords[idx] for idx in \
                filter(lambda i: 0 == i or keywords[i] != keywords[i-1], range(len(keywords)))]
        if len(words) < 4:
            self.expand(words, 4)
        else:
            while len(words) > 4:
                words.pop()
        return words
Example #9
 def plan(self, text):
     # Keep only the sentences the translator can handle.
     possible_result = []
     for sentence in split_sentences(text):
         target = self.translater.translate(sentence)
         if target != "BAD TAG !":
             possible_result.append(target)
     keywords = sorted(possible_result, key=lambda x: self.ranks[x])
     words = [keywords[idx] for idx in \
             [i for i in range(len(keywords)) if 0 == i or keywords[i] != keywords[i-1]]]
     if len(words) < 2:
         self.expand(words, 2)
     else:
         while len(words) > 2:
             words.pop()
     return words
Example #10
 def __init__(self):
     self.poems = list()
     for corpus_name in _corpus_list:
         corpuspath = os.path.join(raw_dir, corpus_name)
         if not check_uptodate(corpuspath):
             _gen_poems()
     if not check_uptodate(poems_path):
         _gen_poems()
     for corpus_name in _corpus_list:
         corpuspath = os.path.join(raw_dir, corpus_name)
         with open(corpuspath, 'r') as fr:
             for line in fr:
                 # print(line)
                 sentences = split_sentences(
                     line.strip('\r\n ').split()[-1])
                 # print(sentences)
                 self.poems.append(sentences)
         print('self.poems==>', len(self.poems))
Example #11
def _gen_poems():
    print("Parsing poems ...")
    char_dict = CharDict()
    with open(poems_path, 'w') as fout:
        for corpus in _corpus_list:
            with open(os.path.join(raw_dir, corpus), 'r') as fin:
                for line in fin.readlines()[1:]:
                    sentences = split_sentences(line.strip().split()[-1])
                    all_char_in_dict = True
                    for sentence in sentences:
                        for ch in sentence:
                            if char_dict.char2int(ch) < 0:
                                all_char_in_dict = False
                                break
                        if not all_char_in_dict:
                            break
                    if all_char_in_dict:
                        fout.write(' '.join(sentences) + '\n')
            print("Finished parsing %s." % corpus)
Example #12
def _parse_corpus(raw_file, json_file):
    print("Parsing %s ..." % raw_file, end=' ')
    #use in linux
    #sys.stdout.flush()
    rdict = RhymeUtil()
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        tags = fin.readline().strip().split('\t')
        line = fin.readline().strip()
        while line:
            toks = line.split('\t')
            poem = {'source': os.path.basename(raw_file)}
            for idx, tok in enumerate(toks):
                if tags[idx] != 'body':
                    poem[tags[idx]] = tok
                else:
                    body = tok
            flag = True
            left = body.find('(')
            while left >= 0:
                right = body.find(')')
                if right < left:
                    flag = False
                    break
                else:
                    body = body[:left] + body[right + 1:]
                    left = body.find('(')
            if flag and body.find(')') < 0:
                poem['sentences'] = split_sentences(body)
                for sentence in poem['sentences']:
                    if not reduce(lambda x, ch: x and rdict.has_char(ch),
                                  sentence, True):
                        flag = False
                        break
                if flag:
                    data.append(poem)
            line = fin.readline().strip()
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    print("Done (%d poems)" % len(data))
    return data
Example #13
def _gen_poems():
    print("Parsing poems ...")
    word_dict = wordDict()
    with open(poems_path, 'w', encoding='utf-8') as fout:
        for corpus in _corpus_list:
            with open(os.path.join(raw_dir, corpus), 'r',
                      encoding='utf-8') as fin:
                for line in fin.readlines():
                    sentences = split_sentences(line)
                    if len(sentences[0].split()) != 3:
                        continue
                    all_word_in_dict = True
                    for sentence in sentences:
                        sentence = sentence.strip().split()
                        for ch in sentence:
                            if word_dict.word2int(ch) < 0:
                                all_word_in_dict = False
                                break
                        if not all_word_in_dict:
                            break
                    if all_word_in_dict:
                        fout.write('|'.join(sentences) + '\n')
            print("Finished parsing %s." % corpus)
Example #14
def _gen_poems():
    print("Parsing poems ...")
    chardict = CharDict()
    corpus = list()
    for corpus_name in _corpus_list:
        corpuspath = os.path.join(raw_dir, corpus_name)
        with open(corpuspath, 'r') as fr:
            for index, line in enumerate(fr):
                if index == 0:
                    continue
                all_in_char = True
                sentences = split_sentences(line.split()[3])
                for sentence in sentences:
                    for char in sentence:
                        if chardict[char] < 0:
                            all_in_char = False
                            # raise ValueError('char\t{}\tis not in char_dict??'.format(char))
                if all_in_char:
                    corpus.append(sentences)
    corpus_sorted = sorted(corpus, key=lambda x: (-len(x[0]), -len(x)))
    with open(poems_path, 'w') as fw:
        for sentences in corpus_sorted:
            fw.write(' '.join(sentences) + '\n')
    print("Finished parsing %s." % corpus)
Example #15
    def segment(self, sentence):
        # TODO: try CRF-based segmentation.
        toks = []
        idx = 0
        while idx + 4 <= len(sentence):
            # Cut 2 chars each time.
            if sentence[idx:idx + 2] in self.sxhy_dict:
                toks.append(sentence[idx:idx + 2])
            else:
                for tok in jieba.lcut(sentence[idx:idx + 2]):
                    toks.append(tok)
            idx += 2
        # Cut the remaining chars (at most 3).
        if idx < len(sentence):
            if sentence[idx:] in self.sxhy_dict:
                toks.append(sentence[idx:])
            else:
                for tok in jieba.lcut(sentence[idx:]):
                    toks.append(tok)
        return toks


# For testing purpose.
if __name__ == '__main__':
    segmenter = Segmenter()
    with open(os.path.join(raw_dir, 'qts_tab.txt'), 'r') as fin:
        for line in fin.readlines()[1:6]:
            for sentence in split_sentences(line.strip().split()[3]):
                print(' '.join(segmenter.segment(sentence)))
Example #16
def extract_relation_each_file(source_filepath, ere_filepath,
                               annotation_filepath, part_name, with_none):
    relation_records_each_file = []

    source_fp = open(source_filepath)
    all_source_text = source_fp.read().decode("utf-8")  # mind the encoding
    source_fp.close()
    sentences = split_sentences(all_source_text)  # split into sentences

    ere_file = xml.dom.minidom.parse(ere_filepath)
    ere_root = ere_file.documentElement
    relation_list = ere_root.getElementsByTagName('relation')
    entity_list = ere_root.getElementsByTagName('entity')
    filler_list = ere_root.getElementsByTagName('filler')

    annotation_file = xml.dom.minidom.parse(annotation_filepath)
    annotation_root = annotation_file.documentElement
    annotation_belief_list = annotation_root.getElementsByTagName(
        'belief_annotations')
    annotation_belief_list = annotation_belief_list[0]
    annotation_relation_list = annotation_belief_list.getElementsByTagName(
        'relation')  # these are actually relation_mention elements

    for i in range(len(relation_list)):
        # relation info
        relation_id = relation_list[i].getAttribute('id')
        relation_type = relation_list[i].getAttribute('type')
        relation_subtype = relation_list[i].getAttribute('subtype')
        relation_mention_list = relation_list[i].getElementsByTagName(
            'relation_mention')

        for j in range(len(relation_mention_list)):
            # relation mention info
            relation_mention_id = relation_mention_list[j].getAttribute('id')
            relation_mention_realis = relation_mention_list[j].getAttribute(
                'realis')

            # polarity
            for k in range((len(annotation_relation_list))):
                annotation_relation_id = annotation_relation_list[
                    k].getAttribute('ere_id')
                if annotation_relation_id == relation_mention_id:
                    be_em = annotation_relation_list[k].getElementsByTagName(
                        'belief')
                    # if len(st_em) == 0:
                    #     logger.info("错误:无情感标签。" + " " + part_name + " " + annotation_relation_id +
                    #                 " " + relation_mention_id)
                    label_type = be_em[0].getAttribute('type')
                    break
            if with_none is False and label_type == 'na':
                break  # discard the sample if the label is none

            # rel_arg is an entity
            # basic info
            rel_arg1 = relation_mention_list[j].getElementsByTagName(
                'rel_arg1')
            rel_arg1 = rel_arg1[0]
            rel_arg1_id = rel_arg1.getAttribute('entity_id')
            rel_arg1_mention_id = rel_arg1.getAttribute('entity_mention_id')
            rel_arg1_role = rel_arg1.getAttribute('role')
            rel_arg1_text = rel_arg1.firstChild.data
            # info of the owning entity and its entity mention
            rel_arg1_entity_type, rel_arg1_entity_specificity, rel_arg1_mention_noun_type, rel_arg1_mention_offset, \
            rel_arg1_mention_length, rel_arg1_context, rel_arg1_window_text = rel_arg_entity_info(entity_list, rel_arg1_id,
                                                                            rel_arg1_mention_id, rel_arg1_text,
                                                                            sentences)

            # rel_arg2, same as above
            rel_arg2 = relation_mention_list[j].getElementsByTagName(
                'rel_arg2')
            rel_arg2 = rel_arg2[0]
            rel_arg2_role = rel_arg2.getAttribute('role')
            rel_arg2_text = rel_arg2.firstChild.data
            rel_arg2_id = rel_arg2.getAttribute('entity_id')
            if rel_arg2_id != '':
                rel_arg2_mention_id = rel_arg2.getAttribute(
                    'entity_mention_id')
                # info of the owning entity and its entity mention
                rel_arg2_entity_type, rel_arg2_entity_specificity, rel_arg2_mention_noun_type, rel_arg2_mention_offset, \
                rel_arg2_mention_length, rel_arg2_context, rel_arg2_window_text = rel_arg_entity_info(entity_list, rel_arg2_id,
                                                                            rel_arg2_mention_id, rel_arg2_text,
                                                                            sentences)
                rel_arg2_is_filler = 0
            else:  # some rel_arg2 are fillers rather than entities; handle them simply for now
                rel_arg2_is_filler = 1
                rel_arg2_id = rel_arg2.getAttribute('filler_id')
                # if rel_arg2_id == '':
                #     logger.info("错误:参数不是entity或filler。" + " " + part_name + " " + relation_mention_id)
                rel_arg2_entity_type, rel_arg2_mention_offset, rel_arg2_mention_length, rel_arg2_context, rel_arg2_window_text = \
                    rel_arg_filler_info(filler_list, rel_arg2_id, rel_arg2_text, sentences)
                rel_arg2_mention_id = ''
                rel_arg2_entity_specificity = ''
                rel_arg2_mention_noun_type = ''

            # trigger
            trigger = relation_mention_list[j].getElementsByTagName(
                'trigger')  # ? to be verified
            if len(trigger) == 0:
                trigger_offset = 0
                trigger_length = 0
                trigger_text = ""
                trigger_context = ""
            else:
                trigger = trigger[0]
                trigger_offset = int(trigger.getAttribute('offset'))
                trigger_length = int(trigger.getAttribute('length'))
                trigger_text = trigger.firstChild.data
                # context info
                above = 0
                below = 0  # tunable; triggers often contain words like "at", so a long context seems unsuitable, and only the current sentence is extracted here
                trigger_context_dict = find_context(trigger_offset, sentences,
                                                    trigger_text, above, below)
                # join into a single string
                trigger_context = context_dict_to_string(
                    trigger_context_dict, above, below)

            # actual source
            source = be_em[0].getElementsByTagName('source')
            if label_type == 'na' or len(source) == 0:
                source_id = ''
                source_offset = 0
                source_length = 0
                source_text = ''
            else:
                source = source[0]
                source_id = source.getAttribute('ere_id')
                source_offset = int(source.getAttribute('offset'))
                source_length = int(source.getAttribute('length'))
                source_text = source.firstChild.data

            relation_record = {
                'file': part_name,
                'relation_id': relation_id,
                'relation_type': relation_type,
                'relation_subtype': relation_subtype,
                'relation_mention_id': relation_mention_id,
                'relation_mention_realis': relation_mention_realis,
                'rel_arg1_id': rel_arg1_id,
                'rel_arg1_mention_id': rel_arg1_mention_id,
                'rel_arg1_role': rel_arg1_role,
                'rel_arg1_text': rel_arg1_text,
                'rel_arg1_entity_type': rel_arg1_entity_type,
                'rel_arg1_entity_specificity': rel_arg1_entity_specificity,
                'rel_arg1_mention_noun_type': rel_arg1_mention_noun_type,
                'rel_arg1_mention_offset': rel_arg1_mention_offset,
                'rel_arg1_mention_length': rel_arg1_mention_length,
                'rel_arg1_context': rel_arg1_context,
                'rel_arg1_window_text': rel_arg1_window_text,
                'rel_arg2_id': rel_arg2_id,
                'rel_arg2_mention_id': rel_arg2_mention_id,
                'rel_arg2_role': rel_arg2_role,
                'rel_arg2_text': rel_arg2_text,
                'rel_arg2_entity_type': rel_arg2_entity_type,
                'rel_arg2_entity_specificity': rel_arg2_entity_specificity,
                'rel_arg2_mention_noun_type': rel_arg2_mention_noun_type,
                'rel_arg2_mention_offset': rel_arg2_mention_offset,
                'rel_arg2_mention_length': rel_arg2_mention_length,
                'rel_arg2_context': rel_arg2_context,
                'rel_arg2_is_filler': rel_arg2_is_filler,
                'rel_arg2_window_text': rel_arg2_window_text,
                'trigger_offset': trigger_offset,
                'trigger_length': trigger_length,
                'trigger_text': trigger_text,
                'trigger_context': trigger_context,
                'label_type': label_type,
                'source_id': source_id,
                'source_offset': source_offset,
                'source_length': source_length,
                'source_text': source_text
            }

            relation_records_each_file.append(relation_record)

    return relation_records_each_file
Example #17
def extract_event_each_file(source_filepath, ere_filepath, annotation_filepath,
                            part_name, with_none):
    event_records_each_file = []
    em_args_each_file = []

    source_fp = open(source_filepath)
    all_source_text = source_fp.read().decode("utf-8")  # mind the encoding
    source_fp.close()
    sentences = split_sentences(all_source_text)  # split into sentences

    ere_file = xml.dom.minidom.parse(ere_filepath)
    ere_root = ere_file.documentElement
    hopper_list = ere_root.getElementsByTagName('hopper')
    entity_list = ere_root.getElementsByTagName('entity')
    filler_list = ere_root.getElementsByTagName('filler')

    annotation_file = xml.dom.minidom.parse(annotation_filepath)
    annotation_root = annotation_file.documentElement
    annotation_belief_list = annotation_root.getElementsByTagName(
        'belief_annotations')
    annotation_belief_list = annotation_belief_list[0]
    annotation_event_list = annotation_belief_list.getElementsByTagName(
        'event')

    for i in range(len(hopper_list)):
        # hopper info
        hopper_id = hopper_list[i].getAttribute('id')
        event_mention_list = hopper_list[i].getElementsByTagName(
            'event_mention')

        for j in range(len(event_mention_list)):
            # event info
            event_mention_id = event_mention_list[j].getAttribute('id')
            event_mention_type = event_mention_list[j].getAttribute('type')
            event_mention_subtype = event_mention_list[j].getAttribute(
                'subtype')
            event_mention_realis = event_mention_list[j].getAttribute('realis')
            event_mention_ways = event_mention_list[j].getAttribute('ways')

            # polarity
            for k in range((len(annotation_event_list))):
                annotation_event_id = annotation_event_list[k].getAttribute(
                    'ere_id')
                if annotation_event_id == event_mention_id:
                    be_em = annotation_event_list[k].getElementsByTagName(
                        'belief')
                    # if len(st_em) == 0:
                    #     logger.info("错误:无情感标签。" + " " + part_name + " " + annotation_event_id +
                    #                 " " + event_mention_id)
                    label_type = be_em[0].getAttribute('type')
                    break
            if with_none is False and label_type == 'none':
                break  # discard the sample if the label is none

            # trigger
            trigger = event_mention_list[j].getElementsByTagName('trigger')
            trigger = trigger[0]
            trigger_offset = int(trigger.getAttribute('offset'))
            trigger_length = int(trigger.getAttribute('length'))
            trigger_text = trigger.firstChild.data
            # context info
            above = 3
            below = 3  # tunable
            trigger_context_dict = find_context(trigger_offset, sentences,
                                                trigger_text, above, below)
            # join into a single string
            trigger_context = context_dict_to_string(trigger_context_dict,
                                                     above, below)
            # further extract window words from the context
            window_length = 10
            sen = trigger_context_dict[0]['text']
            sen_offset = trigger_context_dict[0]['offset']
            window_text = get_window_text(window_length, sen, sen_offset,
                                          trigger_text, trigger_offset)

            # em_arg
            em_args = event_mention_list[j].getElementsByTagName('em_arg')
            em_arg_num = len(em_args)
            # print em_arg_num  # usually no more than 4
            for em_arg in em_args:
                em_arg_role = em_arg.getAttribute('role')
                em_arg_text = em_arg.firstChild.data
                em_arg_id = em_arg.getAttribute('entity_id')
                if em_arg_id != "":  # 是entity
                    em_arg_mention_id = em_arg.getAttribute(
                        'entity_mention_id')
                    # info of the owning entity and its entity mention
                    em_arg_entity_type, em_arg_entity_specificity, em_arg_mention_noun_type, em_arg_mention_offset, \
                    em_arg_mention_length, em_arg_context = em_arg_entity_info(entity_list, em_arg_id,
                                                                                    em_arg_mention_id, em_arg_text,
                                                                                    sentences)
                    em_arg_is_filler = 0  # not a filler
                else:
                    em_arg_id = em_arg.getAttribute('filler_id')
                    # if em_arg_id == "":
                    #     logger.info("错误:参数不是entity或filler。" + " " + part_name + " " + event_mention_id)
                    em_arg_entity_type, em_arg_mention_offset, em_arg_mention_length, em_arg_context = \
                        em_arg_filler_info(filler_list, em_arg_id, em_arg_text, sentences)
                    em_arg_mention_id = ""
                    em_arg_entity_specificity = ""
                    em_arg_mention_noun_type = ""
                    em_arg_is_filler = 1
                em_arg_record = {
                    'file': part_name,
                    'hopper_id': hopper_id,
                    'event_mention_id': event_mention_id,
                    'em_arg_id': em_arg_id,
                    'em_arg_mention_id': em_arg_mention_id,
                    'em_arg_role': em_arg_role,
                    'em_arg_text': em_arg_text,
                    'em_arg_entity_type': em_arg_entity_type,
                    'em_arg_entity_specificity': em_arg_entity_specificity,
                    'em_arg_mention_noun_type': em_arg_mention_noun_type,
                    'em_arg_mention_offset': em_arg_mention_offset,
                    'em_arg_mention_length': em_arg_mention_length,
                    'em_arg_context': em_arg_context,
                    'em_arg_is_filler': em_arg_is_filler
                }
                em_args_each_file.append(em_arg_record)

            # actual source
            source = be_em[0].getElementsByTagName('source')
            if label_type == 'na' or len(source) == 0:
                source_id = ''
                source_offset = 0
                source_length = 0
                source_text = ''
            else:
                source = source[0]
                source_id = source.getAttribute('ere_id')
                source_offset = int(source.getAttribute('offset'))
                source_length = int(source.getAttribute('length'))
                source_text = source.firstChild.data

            event_record = {
                'file': part_name,
                'hopper_id': hopper_id,
                'event_mention_id': event_mention_id,
                'event_mention_type': event_mention_type,
                'event_mention_subtype': event_mention_subtype,
                'event_mention_realis': event_mention_realis,
                'event_mention_ways': event_mention_ways,
                'trigger_offset': trigger_offset,
                'trigger_length': trigger_length,
                'trigger_text': trigger_text,
                'trigger_context': trigger_context,
                'trigger_window_text': window_text,
                'em_arg_num': em_arg_num,
                'label_type': label_type,
                'source_id': source_id,
                'source_offset': source_offset,
                'source_length': source_length,
                'source_text': source_text
            }

            event_records_each_file.append(event_record)

    return event_records_each_file, em_args_each_file
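
Every example on this page calls a project-specific split_sentences helper: utils.split_sentences for the wiki parsers, and poem- or document-oriented splitters in the other examples. None of those implementations is reproduced here; the following is only a hypothetical, regex-based stand-in that illustrates the shared idea of cutting text on sentence-ending punctuation:

import re

# Hypothetical stand-in, not any of the projects' actual helpers.
_SENTENCE_END = re.compile(r'([。!?!?.;;])')

def split_sentences(text):
    """Split text into sentences on common ASCII/CJK terminators."""
    parts = _SENTENCE_END.split(text)
    # Re-attach each terminator to the sentence it closes.
    sentences = [head + tail for head, tail in zip(parts[0::2], parts[1::2])]
    if len(parts) % 2 == 1 and parts[-1].strip():
        sentences.append(parts[-1])
    return [s.strip() for s in sentences if s.strip()]

if __name__ == '__main__':
    print(split_sentences("白日依山尽。黄河入海流。Short test! Done."))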