def _preprocess(self, raw_title, raw_text):
    """Tokenize the title and body text and apply the configured preprocessing."""
    if raw_title is None:
        raw_title = ""
    raw_title = raw_title.strip()
    # raw_title += (raw_title[-1] not in (".", "?", "!")) * "."

    if self.config.lower:
        raw_title = raw_title.lower()
        raw_text = raw_text.lower()

    title_tokens = meng17_tokenize(raw_title)
    text_tokens = meng17_tokenize(raw_text)
    # join title and body with a "." separator token
    tokens = title_tokens + ["."] + text_tokens

    if self.config.replace_digit:
        tokens = replace_numbers_to_DIGIT(tokens, k=2)

    return " ".join(tokens)
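# The helpers called above live in the repository's utils module. Below is a
# minimal sketch of their assumed behavior, inferred only from how they are
# used here (not the repo's actual implementations): meng17_tokenize splits
# text into word/punctuation tokens, and replace_numbers_to_DIGIT swaps long
# numeric tokens for a placeholder so the vocabulary is not flooded with rare
# numbers. The *_sketch names mark these as hypothetical stand-ins.
import re

DIGIT = "<digit>"

def meng17_tokenize_sketch(text):
    # Assumed: split on any character outside a small keep-list and drop empty strings.
    return [t for t in re.split(r"[^a-zA-Z0-9_<>,#&\+\*\(\)\.\'%]", text) if t]

def replace_numbers_to_DIGIT_sketch(tokens, k=2):
    # Assumed: replace purely numeric tokens with at least k digits by the placeholder.
    return [DIGIT if re.match(r"^\d{%d,}$" % k, t) else t for t in tokens]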
def heuristic_filter(src_token, tgts_token, tgts_str, opt):
    '''
    Filter out examples whose source or targets fall outside the configured length
    limits, and drop individual keyphrases that look like noise.
    :param src_token: tokenized source text
    :param tgts_token: list of tokenized target keyphrases
    :param tgts_str: list of raw target keyphrase strings, aligned with tgts_token
    :param opt: options carrying the min/max source and target length limits
    :return: (is_valid, filtered_tgts_token, filtered_tgts_str)
    '''
    print('*' * 50)
    print('len(src)=%d, len(tgt)=%d' % (len(src_token), len(tgts_token)))
    print('src: %s' % str(src_token))
    print('tgt: %s' % str(tgts_token))
    print('*' * 50)

    # SOURCE FILTER: if the length of src is over/under the given limit, discard the example
    if opt.max_src_seq_length and len(src_token) > opt.max_src_seq_length:
        print("INVALID: source is too long [len=%d]: \n%s" % (len(src_token), str(src_token)))
        return False, None, None
    if opt.min_src_seq_length and len(src_token) < opt.min_src_seq_length:
        print("INVALID: source is too short [len=%d]: \n%s" % (len(src_token), str(src_token)))
        return False, None, None

    filtered_tgts_str = []
    filtered_tgts_token = []

    # Go over each keyphrase and check its validity
    for tgt_token, tgt_str in zip(tgts_token, tgts_str):
        tgt_token_for_filter = utils.meng17_tokenize(tgt_str)

        # FILTER 1: if the length of tgt is outside the limits, discard
        if opt.max_tgt_seq_length and len(tgt_token_for_filter) > opt.max_tgt_seq_length:
            print("\tInvalid Target: target is too long: %s (originally %s)" % (str(tgt_token), tgt_str))
            continue
        if opt.min_tgt_seq_length and len(tgt_token_for_filter) < opt.min_tgt_seq_length:
            print("\tInvalid Target: target is too short: %s (originally %s)" % (str(tgt_token), tgt_str))
            continue

        # FILTER 2: ignore keyphrases that contain strange punctuation (very dirty data)
        punc_flag = False
        puncts = re.findall(r'[,_\"<>\(\){}\[\]\?~`!@$%\^=]', tgt_str)
        if len(puncts) > 0:
            print('-' * 50)
            print('Find punctuations in keyword: %s' % tgt_str)
            print('- tokens: %s' % str(tgt_token))
            punc_flag = True

        # FILTER 3: check the quality of long keyphrases (>5 words) with a heuristic rule:
        # discard phrases in which more than half of the words are repeats
        heuristic_rule_flag = False
        if len(tgt_token_for_filter) > 5:
            tgt_set = set(tgt_token_for_filter)
            if len(tgt_set) * 2 < len(tgt_token_for_filter):
                heuristic_rule_flag = True

        # FILTER 4: filter keywords like "primary 75v05;secondary 76m10;65n30" (MSC-style codes)
        if (len(tgt_token_for_filter) > 0 and re.match(r'\d\d[a-zA-Z\-]\d\d', tgt_token_for_filter[0].strip())) \
                or (len(tgt_token_for_filter) > 1 and re.match(r'\d\d\w\d\d', tgt_token_for_filter[1].strip())):
            print('\tInvalid Target: matching template \\d\\d[a-z]\\d\\d: %s' % tgt_str)
            continue

        if punc_flag or heuristic_rule_flag:
            if heuristic_rule_flag:
                print('\tInvalid Target: heuristic_rule on long keyphrases (>5 words)')
            if punc_flag:
                print('\tInvalid Target: found punctuation in keyphrases')
            continue

        filtered_tgts_str.append(tgt_str)
        filtered_tgts_token.append(tgt_token)

    # ignore examples with zero valid targets; they are not helpful for training
    if len(filtered_tgts_str) == 0:
        print('INVALID: found no valid targets')
        return False, None, None

    return True, filtered_tgts_token, filtered_tgts_str
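# Hedged usage sketch for heuristic_filter above. `Namespace` stands in for the
# real `opt` produced by the repository's argument parser; only the four length
# fields read by the function are set, and the toy example values are invented.
from argparse import Namespace

if __name__ == "__main__":
    opt = Namespace(max_src_seq_length=300, min_src_seq_length=10,
                    max_tgt_seq_length=6, min_tgt_seq_length=1)
    src_token = "we study keyphrase generation for scientific abstracts".split() * 3
    tgts_str = ["keyphrase generation", "primary 75v05;secondary 76m10;65n30"]
    tgts_token = [s.split() for s in tgts_str]

    valid, kept_token, kept_str = heuristic_filter(src_token, tgts_token, tgts_str, opt)
    # Assuming utils.meng17_tokenize separates the tokens around ';', the MSC-style
    # code is dropped by FILTER 4 and only "keyphrase generation" survives,
    # so valid is True and kept_str == ["keyphrase generation"].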
if opt.lower:
    title = title.lower()
    abstract = abstract.lower()
    keywords = [k.lower() for k in keywords]

if opt.tokenizer == "str":
    title_token = [title]
    abstract_token = [abstract]
    keywords_token = keywords
elif opt.tokenizer == "en_word":
    title_token = title.split(' ')
    abstract_token = abstract.split(' ')
    keywords_token = [kw.split(' ') for kw in keywords]
elif opt.tokenizer == "meng17":
    title_token = utils.meng17_tokenize(title)
    abstract_token = utils.meng17_tokenize(abstract)
    keywords_token = [utils.meng17_tokenize(kw) for kw in keywords]
elif opt.tokenizer == "en_retain_punc":
    title_token = utils.retain_punc_tokenize(title)
    abstract_token = utils.retain_punc_tokenize(abstract)
    keywords_token = [utils.retain_punc_tokenize(kw) for kw in keywords]
elif opt.tokenizer == "en_subword":
    raise NotImplementedError
else:
    raise NotImplementedError

if opt.replace_digit:
    title_token = utils.replace_numbers_to_DIGIT(title_token, k=2)
    abstract_token = utils.replace_numbers_to_DIGIT(abstract_token, k=2)
    keywords_token = [utils.replace_numbers_to_DIGIT(kw, k=2) for kw in keywords_token]
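# Hedged illustration of the "en_word" branch above on an invented record; the
# lowercasing and split(' ') calls mirror exactly what that branch does, while
# the "meng17" / "en_retain_punc" branches depend on the repo's utils functions.
# The demo_* names are hypothetical and only exist for this illustration.
demo_title = "Deep Keyphrase Generation"
demo_keywords = ["Keyphrase Generation", "Sequence to Sequence"]

assert demo_title.lower().split(' ') == ['deep', 'keyphrase', 'generation']
assert [kw.lower().split(' ') for kw in demo_keywords] == \
    [['keyphrase', 'generation'], ['sequence', 'to', 'sequence']]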