def process_examples(lang_id,
                     source,
                     source_tag,
                     target,
                     max_src_len,
                     max_tgt_len,
                     code_tag_type,
                     uncase=False,
                     test_split=True):
    """Turn one (source, target) pair into a {'code': Code, 'summary': Summary} dict.

    Returns None when the example is unusable: tag/token length mismatch,
    empty source after truncation, or empty target tokens.
    `uncase` lowercases the summary; when `test_split` is False the summary
    is truncated to `max_tgt_len` (test examples keep the full reference).
    """
    tokens = source.split()
    tags = [] if source_tag is None else source_tag.split()
    # A per-token tag sequence must align one-to-one with the tokens.
    if source_tag is not None and len(tags) != len(tokens):
        return None

    tokens, tags = tokens[:max_src_len], tags[:max_src_len]
    if not tokens:
        return None

    # 'subtoken' tags come from the token-level map; anything else is AST-level.
    tag_map = TOKEN_TYPE_MAP if code_tag_type == 'subtoken' else AST_TYPE_MAP
    code = Code()
    code.text = source
    code.language = lang_id
    code.tokens = tokens
    code.type = [tag_map.get(tag, 1) for tag in tags]
    if code_tag_type != 'subtoken':
        # Mask marks AST nodes tagged 'N' (presumably "name" nodes — verify upstream).
        code.mask = [int(tag == 'N') for tag in tags]

    summary = None
    if target is not None:
        text = target.lower() if uncase else target
        words = text.split()
        if not test_split:
            words = words[:max_tgt_len]
        if not words:
            return None
        summary = Summary()
        summary.text = ' '.join(words)
        summary.tokens = words
        summary.prepend_token(BOS_WORD)
        summary.append_token(EOS_WORD)

    return {'code': code, 'summary': summary}
# Example 2: variant of process_examples that also carries a token-relation
# matrix. NOTE: it reuses the same name and therefore shadows the definition above.
def process_examples(lang_id,
                     source,
                     source_tag,
                     target,
                     rel_matrix,
                     max_src_len,
                     max_tgt_len,
                     code_tag_type,
                     uncase=False,
                     test_split=True,
                     split_tokens=False):
    """Turn one example into a dict with 'code', 'summary' and optional 'rel_matrix'.

    Unlike the subtoken/AST variant above, this one stores the raw tag strings
    in ``code.type`` and can attach a per-token relation matrix.

    Returns None when the example is unusable (tag/token mismatch, empty
    source after truncation, empty target). Raises ValueError when
    ``rel_matrix`` does not align with the source tokens.
    """
    code_tokens = source.split()
    code_type = []
    if source_tag is not None:
        code_type = source_tag.split()
        if len(code_tokens) != len(code_type):
            return None
    if rel_matrix is not None:
        if len(rel_matrix) != len(code_tokens):
            # BUG FIX: the original wrote `"... %d %d" % \ len(a), len(b)`,
            # which applies % to a single int (TypeError: not enough arguments
            # for format string) and passes len(b) as a second ValueError arg.
            # The operands must be a tuple.
            raise ValueError("len(rel_matrix) != len(code_tokens): %d %d"
                             % (len(rel_matrix), len(code_tokens)))
        rel_matrix = [s.split() for s in rel_matrix]
    else:
        rel_matrix = []

    code_tokens = code_tokens[:max_src_len]
    code_type = code_type[:max_src_len]
    rel_matrix = rel_matrix[:max_src_len]

    if not code_tokens:
        return None

    code = Code()
    code.text = source
    code.language = lang_id
    code.tokens = code_tokens
    if split_tokens:
        # Tokens are assumed to be snake_case; expose their subtoken pieces.
        code.subtokens = [token.split("_") for token in code_tokens]
    # Raw tag strings are kept here (no TAG_TYPE_MAP lookup in this variant);
    # the unused map computation and dead commented-out code were removed.
    code.type = code_type

    if target is not None:
        summ = target.lower() if uncase else target
        summ_tokens = summ.split()
        if not test_split:
            summ_tokens = summ_tokens[:max_tgt_len]
        if not summ_tokens:
            return None
        summary = Summary()
        summary.text = ' '.join(summ_tokens)
        summary.tokens = summ_tokens
        summary.prepend_token(BOS_WORD)
        summary.append_token(EOS_WORD)
    else:
        summary = None

    example = dict()
    example['code'] = code
    example['summary'] = summary
    if rel_matrix:
        # Relation rows are wrapped in a Code container to match downstream
        # vectorization — TODO confirm this is intentional rather than a shim.
        rm = Code()
        rm.tokens = rel_matrix
        example["rel_matrix"] = rm
    return example