def process_examples(lang_id, source, source_tag, target, max_src_len,
                     max_tgt_len, code_tag_type, uncase=False, test_split=True):
    code_tokens = source.split()
    code_type = []
    if source_tag is not None:
        code_type = source_tag.split()
        if len(code_tokens) != len(code_type):
            return None

    code_tokens = code_tokens[:max_src_len]
    code_type = code_type[:max_src_len]
    if len(code_tokens) == 0:
        return None

    TAG_TYPE_MAP = TOKEN_TYPE_MAP if \
        code_tag_type == 'subtoken' else AST_TYPE_MAP
    code = Code()
    code.text = source
    code.language = lang_id
    code.tokens = code_tokens
    code.type = [TAG_TYPE_MAP.get(ct, 1) for ct in code_type]
    if code_tag_type != 'subtoken':
        code.mask = [1 if ct == 'N' else 0 for ct in code_type]

    if target is not None:
        summ = target.lower() if uncase else target
        summ_tokens = summ.split()
        if not test_split:
            summ_tokens = summ_tokens[:max_tgt_len]
        if len(summ_tokens) == 0:
            return None
        summary = Summary()
        summary.text = ' '.join(summ_tokens)
        summary.tokens = summ_tokens
        summary.prepend_token(BOS_WORD)
        summary.append_token(EOS_WORD)
    else:
        summary = None

    example = dict()
    example['code'] = code
    example['summary'] = summary
    return example

def process_examples(lang_id, source, source_tag, target, rel_matrix, max_src_len,
                     max_tgt_len, code_tag_type, uncase=False, test_split=True,
                     split_tokens=False):
    """Build an example dict with 'code', 'summary', and optional 'rel_matrix' entries;
    returns None if the inputs are empty or mismatched."""
    code_tokens = source.split()
    code_type = []
    if source_tag is not None:
        code_type = source_tag.split()
        if len(code_tokens) != len(code_type):
            return None

    if rel_matrix is not None:
        if len(rel_matrix) != len(code_tokens):
            raise ValueError("len(rel_matrix) != len(code_tokens): %d %d" %
                             (len(rel_matrix), len(code_tokens)))
        rel_matrix = [s.split() for s in rel_matrix]
    else:
        rel_matrix = []

    code_tokens = code_tokens[:max_src_len]
    code_type = code_type[:max_src_len]
    rel_matrix = rel_matrix[:max_src_len]
    if len(code_tokens) == 0:
        return None

    TAG_TYPE_MAP = TOKEN_TYPE_MAP if \
        code_tag_type == 'subtoken' else AST_TYPE_MAP
    code = Code()
    code.text = source
    code.language = lang_id
    code.tokens = code_tokens
    if split_tokens:
        # Also keep the underscore-delimited subtoken split of each token.
        code.subtokens = [token.split("_") for token in code_tokens]
    # Token types are kept as raw tag strings; the original id-mapping is disabled.
    code.type = code_type
    # code.type = [TAG_TYPE_MAP.get(ct, 1) for ct in code_type]
    # if code_tag_type != 'subtoken':
    #     code.mask = [1 if ct == 'N' else 0 for ct in code_type]

    if target is not None:
        summ = target.lower() if uncase else target
        summ_tokens = summ.split()
        if not test_split:
            summ_tokens = summ_tokens[:max_tgt_len]
        if len(summ_tokens) == 0:
            return None
        summary = Summary()
        summary.text = ' '.join(summ_tokens)
        summary.tokens = summ_tokens
        summary.prepend_token(BOS_WORD)
        summary.append_token(EOS_WORD)
    else:
        summary = None

    example = dict()
    example['code'] = code
    example['summary'] = summary
    if rel_matrix:
        rm = Code()
        rm.tokens = rel_matrix
        example["rel_matrix"] = rm
    return example
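
# Minimal usage sketch of the rel_matrix-aware variant defined above. The sample
# source/target strings and argument values are purely illustrative, and Code,
# Summary, BOS_WORD, EOS_WORD, TOKEN_TYPE_MAP, and AST_TYPE_MAP are assumed to be
# provided by the surrounding module.
if __name__ == "__main__":
    example = process_examples(
        lang_id=0,
        source="public int add ( int a , int b ) { return a + b ; }",
        source_tag=None,     # no per-token tags in this sketch
        target="Adds two integers .",
        rel_matrix=None,     # or one whitespace-separated row per code token
        max_src_len=400,
        max_tgt_len=30,
        code_tag_type='subtoken',
        uncase=True,
        split_tokens=True,
    )
    if example is not None:
        print(example['code'].tokens)     # truncated code tokens
        print(example['summary'].tokens)  # summary tokens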