Code Example #1
def mix_sources(source_A, source_B, from_line, to_line=-1):
    """Put a little bit of B into A
    """
    if to_line == -1:
        to_line = from_line

    file_A_lines = [line + '\n' for line in source_A.split('\n')]
    file_B_lines = [line + '\n' for line in source_B.split('\n')]

    tokens_A = tokenizer.tokenize(source_A)
    tokens_B = tokenizer.tokenize(source_B)

    tokens = zip(tokens_A, tokens_B)
    lines = range(from_line, to_line)

    output_source = ""

    first_part = ''.join(file_A_lines[:(from_line - 1)])
    output_source += first_part
    from_token = None
    first_token_of_A = None
    to_token = None
    last_token_of_A = None
    for token_A, token_B in tokens:
        if from_line <= token_A.position[0] <= to_line:
            if from_token is None:
                from_token = token_B
                first_token_of_A = token_A
            to_token = token_B
            last_token_of_A = token_A
    # print(first_token_of_A,last_token_of_A)
    if last_token_of_A:
        if first_token_of_A.position[0] != from_line:
            output_source += ''.join(
                file_A_lines[(from_line - 1):(first_token_of_A.position[0] -
                                              1)])
        output_source += " " * (first_token_of_A.position[1] - 1)
        output_source += source_B[(
            len(''.join(file_B_lines[:(from_token.position[0] - 1)])) +
            from_token.position[1] -
            1):(len(''.join(file_B_lines[:(to_token.position[0] - 1)])) +
                to_token.position[1] + len(to_token.value) - 1)]
        output_source += '\n'
        if last_token_of_A.position[0] != to_line:
            output_source += ''.join(
                file_A_lines[(last_token_of_A.position[0]):(to_line)])
        output_source += ''.join(file_A_lines[(to_line):])
    else:
        output_source += ''.join(file_A_lines[(from_line - 1):])

    return output_source
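A minimal usage sketch of the function above, assuming `tokenizer` is javalang.tokenizer imported at module level (the two sample sources are made up for illustration); it prints source_A with line 2 replaced by the parallel token span from source_B:

from javalang import tokenizer  # the `tokenizer` name mix_sources relies on

source_A = "class A {\n    int x = 1;\n}\n"
source_B = "class B {\n    int y = 2 + 3;\n}\n"

print(mix_sources(source_A, source_B, from_line=2))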
Code Example #2
def tokenize_java_code(code):
    byte_str = io.BytesIO(code).read()  # assume `code` is a bytes object (UTF-8 encoded Java source)
    string_obj = byte_str.decode('utf-8')  # Convert to a unicode object

    tokens = list(tokenizer.tokenize(string_obj))
    tokens = [token for t in tokens for token in t.value.split(" ")]
    return tokens
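A minimal usage sketch, assuming `tokenizer` is javalang.tokenizer and that `code` arrives as UTF-8 bytes:

import io
from javalang import tokenizer

print(tokenize_java_code(b'int x = 1 + 2;'))
# ['int', 'x', '=', '1', '+', '2', ';']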
Code Example #3
def count_max_identifier_occurences(snippet):
    '''
	Calculates the maximum number of occurrences of any identifier in any line of the snippet.
	'''
    top_freq_perline = [
    ]  # list with highest identifier frequency in each line

    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            line_identifiers = [
                token.value for token in line_tokens
                if type(token) == JAVA_IDENTIFIER
            ]
            identifier_freq = Counter(line_identifiers)
            if identifier_freq:  # avoid lines without any identifiers
                top_identifier_freq = identifier_freq.most_common(1)[0][1]
            else:
                top_identifier_freq = 0
            top_freq_perline.append(top_identifier_freq)
        except Exception as err:
            top_freq_perline = []
            break

    if top_freq_perline:
        max_identifier_freq = max(top_freq_perline)
    else:
        max_identifier_freq = 0

    return {'max_identifier_occurences': max_identifier_freq}
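This and the following count_* snippets depend on project-level constants (JAVA_IDENTIFIER, JAVA_KEYWORD, JAVA_NUMBER, JAVA_COMPARISON_OPERATORS) that are not shown. A hedged sketch of what count_max_identifier_occurences needs, assuming JAVA_IDENTIFIER is javalang's Identifier token class:

from collections import Counter
from javalang import tokenizer

JAVA_IDENTIFIER = tokenizer.Identifier  # assumed definition

snippet = ['int a = 1;', 'if (a <= 2) { a = a + a; }']
print(count_max_identifier_occurences(snippet))
# {'max_identifier_occurences': 4}  (the identifier a appears four times on line 2)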
Code Example #4
def count_identifiers(snippet):
    '''
	Calculates the average number of identifiers per line and the maximum in any single line of the snippet.
	'''
    perline_identifiers = []
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_identifiers.append([
                token.value for token in line_tokens
                if type(token) == JAVA_IDENTIFIER
            ])
        except Exception as err:
            perline_identifiers = []
            break
    # maximum number of identifiers in any line
    if perline_identifiers:
        max_identifiers_perline = len(max(perline_identifiers, key=len))
        total_identifiers = sum([len(l) for l in perline_identifiers])
        avg_identifiers_perline = total_identifiers / len(perline_identifiers)
    else:
        max_identifiers_perline = 0
        avg_identifiers_perline = 0

    return {
        'max_identifiers_perline': max_identifiers_perline,
        'avg_identifiers_perline': avg_identifiers_perline,
    }
Code Example #5
  def tokenize_and_abstract(
      self,
      source_code):
    """As per the superclass."""
    try:
      java_tokens = tokenizer.tokenize(source_code)
    except tokenizer.LexerError as e:
      logging.warn('The tokenizer raised exception `%s` while parsing %s', e,
                   source_code)
      return (
          (cubert_tokenizer.quote_special(
              unified_tokenizer.TokenKind.ERROR.name),
           unified_tokenizer.TokenKind.ERROR),
          (cubert_tokenizer.quote_special(unified_tokenizer.TokenKind.EOS),
           unified_tokenizer.TokenKind.EOS),
      )

    agnostic_tokens: List[Tuple[str, unified_tokenizer.TokenKind]] = []

    for token in java_tokens:
      # The token kind is the subclass type of the token.
      token_type = type(token)
      if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
        raise ValueError('Received Java token type %s, but it was unexpected, '
                         'while tokenizing \n%s\n' % (token_type, source_code))

      agnostic_tokens.append(
          (token.value, JavaTokenizer._TOKEN_TYPE_MAP[token_type]))

    return agnostic_tokens
Code Example #6
def count_identifier_length(snippet):
    '''
	Calculates the average and maximum identifier length in the snippet.
	'''
    perline_identifiers = []
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_identifiers.append([
                token.value for token in line_tokens
                if type(token) == JAVA_IDENTIFIER
            ])
        except Exception as err:
            perline_identifiers = []
            break

    # concatenate sublists of identifiers into one list.
    total_identifiers = list(chain.from_iterable(perline_identifiers))
    if total_identifiers:
        max_identifier_length = len(max(total_identifiers, key=len))
        avg_identifier_length = sum(map(
            len, total_identifiers)) / len(total_identifiers)
    else:
        max_identifier_length = 0
        avg_identifier_length = 0

    return {
        'max_identifier_length': max_identifier_length,
        'avg_identifier_length': avg_identifier_length
    }
Code Example #7
def count_numbers(snippet):
    '''
	Calculates the average number of numeric literals per line and the maximum in any single line of the snippet.
	'''
    perline_numbers = []
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_numbers.append([
                token.value for token in line_tokens
                if type(token) in JAVA_NUMBER
            ])
        except Exception as err:
            perline_numbers = []
            break

    if perline_numbers:
        # maximum number of number variables in any line
        max_numbers_perline = len(max(perline_numbers, key=len))

        # total number variables in snippet
        total_numbers = sum([len(l) for l in perline_numbers])

        # average numbers per line in snippet
        avg_numbers_perline = total_numbers / len(perline_numbers)
    else:
        max_numbers_perline = 0
        avg_numbers_perline = 0

    return {
        'max_numbers_perline': max_numbers_perline,
        'avg_numbers_perline': avg_numbers_perline
    }
Code Example #8
def count_comparison_operators(snippet):
    '''
	Calculates the average number of comparison operators per line in the snippet.
	'''
    total_lines = 0
    total_operators = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_operators += sum([
                1 for token in line_tokens
                if token.value in JAVA_COMPARISON_OPERATORS
            ])
        except Exception as err:
            total_operators = 0
            break

    if total_lines > 0:
        avg_comparison_operators_perline = total_operators / total_lines
    else:
        avg_comparison_operators_perline = 0

    return {
        'avg_comparison_operators_perline': avg_comparison_operators_perline
    }
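A hedged sketch for this one, assuming JAVA_COMPARISON_OPERATORS is a plain set of operator strings (the original project's set may be larger):

from javalang import tokenizer

JAVA_COMPARISON_OPERATORS = {'==', '!=', '<', '>', '<=', '>='}  # assumed

print(count_comparison_operators(['if (a >= b && a != c) {', '}']))
# {'avg_comparison_operators_perline': 1.0}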
Code Example #9
def count_keywords(snippet):
    '''
	Calculates the average number of keywords per line and the maximum in any single line of the snippet.
	'''
    perline_keywords = []
    for line in snippet:
        try:
            line_tokens = list(tokenizer.tokenize(line))
            perline_keywords.append([
                token.value for token in line_tokens
                if type(token) == JAVA_KEYWORD
            ])
        except Exception as err:
            perline_keywords = []
            break

    if perline_keywords:
        # maximum number of keywords in any line
        max_keywords_perline = len(max(perline_keywords, key=len))

        # total keywords in snippet
        total_keywords = sum([len(l) for l in perline_keywords])

        # average keywords per line in snippet
        avg_keywords_perline = total_keywords / len(perline_keywords)
    else:
        max_keywords_perline = 0
        avg_keywords_perline = 0

    return {
        'max_keywords_perline': max_keywords_perline,
        'avg_keywords_perline': avg_keywords_perline
    }
Code Example #10
File: parse.py Project: lqc/javalang
def parse_member_signature(sig):
    if not sig.endswith(';'):
        sig = sig + ';'

    tokens = tokenize(sig)
    parser = Parser(tokens)

    return parser.parse_member_declaration()
Code Example #11
File: parse.py Project: lqc/javalang
def parse_expression(exp):
    if not exp.endswith(';'):
        exp = exp + ';'

    tokens = tokenize(exp)
    parser = Parser(tokens)

    return parser.parse_expression()
Code Example #12
File: parse.py Project: lqc/javalang
def parse_type_signature(sig):
    if sig.endswith(';'):
        sig = sig[:-1]
    sig = sig + '{ }'

    tokens = tokenize(sig)
    parser = Parser(tokens)

    return parser.parse_class_or_interface_declaration()
Code Example #13
File: helpers.py Project: darshakpranpariya/pisco
def tokenize(code):
    """Tokenizes a given source code

    Args:
        tokens: list of string tokens
    """
    if code not in TOKENIZER_CACHE:
        # Cache a fully materialized list rather than a lazy `map` object,
        # which would be exhausted after the first iteration.
        TOKENIZER_CACHE[code] = [t.value for t in tokenizer.tokenize(code)]
    return TOKENIZER_CACHE[code]
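A minimal usage sketch, assuming `tokenizer` is javalang.tokenizer and TOKENIZER_CACHE is a module-level dict:

from javalang import tokenizer

TOKENIZER_CACHE = {}

print(tokenize('int x = 1;'))  # ['int', 'x', '=', '1', ';']
print(tokenize('int x = 1;'))  # the second call is served from the cache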
Code Example #14
 def tokenize_code(self, code_snippet, identifier, verbose=0):
     code = self.parse_code(code_snippet, identifier)
     if code == ERROR_MESSAGE or code == EMPTY_MESSAGE:
         return []
     try:
         return [t.value for t in tokenizer.tokenize(code)]
     except Exception as e:
         if verbose == 1:
             print('\n'.join([code, str(e)]))
         return []
Code Example #15
File: parse.py Project: lqc/javalang
def parse_constructor_signature(sig):
    # Add an empty body to the signature, replacing a ; if necessary
    if sig.endswith(';'):
        sig = sig[:-1]
    sig = sig + '{ }'

    tokens = tokenize(sig)
    parser = Parser(tokens)

    return parser.parse_member_declaration()
Code Example #16
    def tokenize_and_abstract(self, source_code):
        """As per the superclass."""
        agnostic_tokens: List[unified_tokenizer.AbstractToken] = []

        try:
            java_tokens = tokenizer.tokenize(source_code)

            for token in java_tokens:
                # The token kind is the subclass type of the token.
                token_type = type(token)
                if token_type not in JavaTokenizer._TOKEN_TYPE_MAP:
                    raise ValueError(
                        'Received Java token type %s, but it was unexpected, '
                        'while tokenizing \n%s\n' % (token_type, source_code))

                # The tokenizer seems to take some liberties with Unicode, returning
                # invalid characters. This cleans spellings up.
                spelling = token.value.encode('utf-8',
                                              errors='replace').decode('utf-8')

                agnostic_tokens.append(
                    unified_tokenizer.AbstractToken(
                        spelling,
                        JavaTokenizer._TOKEN_TYPE_MAP[token_type],
                        unified_tokenizer.
                        TokenMetadata(start=unified_tokenizer.Position(
                            # JavaTokenizer counts lines and columns from 1.
                            line=token.position.line - 1,
                            column=token.position.column - 1))))
        except (tokenizer.LexerError, TypeError) as e:
            # Sometimes, javalang returns a TypeError when reading a number.
            # See
            # https://github.com/c2nes/javalang/blob/0664afb7f4d40254312693f2e833c1ed4ac551c7/javalang/tokenizer.py#L370
            logging.warn(
                'The tokenizer raised exception `%r` while parsing %s', e,
                source_code)
            agnostic_tokens.append(
                unified_tokenizer.AbstractToken(
                    cubert_tokenizer.quote_special(
                        unified_tokenizer.TokenKind.ERROR.name),
                    unified_tokenizer.TokenKind.ERROR,
                    unified_tokenizer.TokenMetadata()))

        # javalang doesn't seem to ever return `EndOfInput` despite there being a
        # token type for it. We insert it here.
        agnostic_tokens.append(
            unified_tokenizer.AbstractToken(
                cubert_tokenizer.quote_special(
                    unified_tokenizer.TokenKind.EOS.name),
                unified_tokenizer.TokenKind.EOS,
                unified_tokenizer.TokenMetadata()))

        return agnostic_tokens
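The spelling clean-up step above (encode with errors='replace', then decode) is easy to check in isolation; this small self-contained snippet shows what it does to an invalid character of the kind the comment mentions:

bad = 'name\udc80'  # a string containing a lone surrogate
clean = bad.encode('utf-8', errors='replace').decode('utf-8')
print(clean)  # 'name?'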
Code Example #17
def tokenize_with_white_space(file_content,
                              relative=True,
                              new_line_at_the_end_of_file=True):
    """
    Tokenize the java source code
    :param file_content: the java source code
    :return: (whitespace, tokens)
    """
    position_last_line = 1
    tokens = tokenizer.tokenize(file_content, parse_comments=True)
    tokens = [t for t in tokens]
    whitespace = list()
    for index in range(0, len(tokens) - 1):
        tokens_position = tokens[index].position
        next_token_position = tokens[index + 1].position
        end_of_token = (tokens_position[0],
                        tokens_position[1] + len(tokens[index].value))
        if end_of_token == next_token_position:
            whitespace.append((0, 0))
        else:
            if (end_of_token[0] == next_token_position[0]):
                # same line
                whitespace.append(
                    (0, next_token_position[1] - end_of_token[1]))
            else:
                # new line
                if relative:
                    whitespace.append(
                        (next_token_position[0] - end_of_token[0] -
                         tokens[index].value.count('\n'),
                         next_token_position[1] - position_last_line))
                    position_last_line = next_token_position[1]
                else:
                    whitespace.append(
                        (next_token_position[0] - end_of_token[0] -
                         tokens[index].value.count('\n'),
                         next_token_position[1]))
    if new_line_at_the_end_of_file:
        whitespace.append((1, 0))
    else:
        if file_content[-1] == '\n':
            if file_content[-2] == '\n':
                whitespace.append((2, 0))
            else:
                whitespace.append((1, 0))
        else:
            whitespace.append((0, 0))
    # rewritten = reformat(whitespace, tokens)
    # print(rewritten)
    # return rewritten
    return whitespace, tokens
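The whitespace arithmetic above hinges on javalang's 1-based (line, column) token positions; a small self-contained illustration of the same end-of-token computation:

from javalang import tokenizer

toks = list(tokenizer.tokenize('int  x;'))
first, second = toks[0], toks[1]
end_of_first = (first.position[0], first.position[1] + len(first.value))
print(second.position[1] - end_of_first[1])  # 2 -> two spaces between 'int' and 'x'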
Code Example #18
File: java_token_diff.py Project: mstmhsmt/cca
def get_tokens(path):
    toks = []
    try:
        with open(path, 'r') as f:
            for tok in tokenizer.tokenize(f.read()):
                toks.append(tok.value)
    except Exception as e:
        pass

    seq = []

    while True:
        try:
            tok = toks.pop(0)

            if tok == '.':
                try:
                    nxt = toks.pop(0)
                    r = '.' + nxt
                    if seq:
                        if seq[-1] not in (',','('):
                            seq[-1] += r
                        else:
                            seq.append(r)
                    else:
                        seq.append(r)

                except IndexError:
                    seq.append(tok)

            elif tok == ',':
                try:
                    nxt = toks.pop(0)
                    if nxt in ('}', ';'):
                        seq.append(nxt)
                    else:
                        seq.append(tok)
                        seq.append(nxt)

                except IndexError:
                    seq.append(tok)

            else:
                seq.append(tok)

        except IndexError:
            break

    return seq
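A minimal usage sketch, assuming `tokenizer` is javalang.tokenizer; note how the '.' handling glues qualified names back together:

import tempfile
from javalang import tokenizer

with tempfile.NamedTemporaryFile('w', suffix='.java', delete=False) as f:
    f.write('class A { void m() { obj.call(1, 2); } }')
    path = f.name

print(get_tokens(path))
# ['class', 'A', '{', 'void', 'm', '(', ')', '{', 'obj.call', '(', '1', ',', '2', ')', ';', '}', '}']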
Code Example #19
File: summary.py Project: gchaperon/java-embeddings
def process_source_code(tup):
    """Get number of tokens and number of lines"""
    code, fname = tup

    try:
        tokens = list(tokenize(code))
    except LexerError as e:
        return Data(bad_syntax=1)
    except Exception as e:
        return Data(library_errors=1)

    if tokens:
        return Data(tokens=len(tokens), lines=tokens[-1].position.line)
    else:
        return Data(empty=1)
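This snippet leans on a project-specific Data container plus javalang's tokenize and LexerError. A hedged sketch with a hypothetical Data stand-in so the function can be exercised:

from dataclasses import dataclass
from javalang.tokenizer import tokenize, LexerError

@dataclass
class Data:  # hypothetical stand-in for the project's Data type
    tokens: int = 0
    lines: int = 0
    bad_syntax: int = 0
    library_errors: int = 0
    empty: int = 0

print(process_source_code(('int x = 1;', 'X.java')))
# Data(tokens=5, lines=1, bad_syntax=0, library_errors=0, empty=0)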
Code Example #20
def count_commas(snippet):
    '''
	Calculates the average number of commas (,) per line in the snippet.
	'''
    total_lines = 0
    total_commas = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_commas += sum(
                [1 for token in line_tokens if token.value == ","])
        except Exception as err:
            total_commas = 0
            break

    if total_lines > 0:
        avg_commas_perline = total_commas / total_lines
    else:
        avg_commas_perline = 0

    return {'avg_commas_perline': avg_commas_perline}
Code Example #21
def count_assignments(snippet):
    '''
	Calculates the average number of assignments (=) per line in the snippet.
	'''
    total_lines = 0
    total_assignments = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_assignments += sum(
                [1 for token in line_tokens if token.value == "="])
        except Exception as err:
            total_assignments = 0
            break

    if total_lines > 0:
        avg_assignments_perline = total_assignments / total_lines
    else:
        avg_assignments_perline = 0

    return {'avg_assignments_perline': avg_assignments_perline}
Code Example #22
def count_parenthesis(snippet):
    '''
	Calculates the average number of parentheses per line in the snippet.
	'''
    total_parenthesis = 0
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_parenthesis += sum([
                1 for token in line_tokens
                if token.value == "(" or token.value == ")"
            ])
        except Exception as err:
            total_parenthesis = 0
            break

    if total_lines > 0:
        avg_parenthesis_perline = total_parenthesis / total_lines
    else:
        avg_parenthesis_perline = 0

    return {'avg_parenthesis_perline': avg_parenthesis_perline}
Code Example #23
def count_branches(snippet):
    '''
	Calculates the average number of branches (if/switch) per line in the snippet.
	'''
    total_branches = 0
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_branches += sum([
                1 for token in line_tokens
                if token.value == "if" or token.value == "switch"
            ])
        except Exception as err:
            total_branches = 0
            break

    if total_lines > 0:
        avg_branches_perline = total_branches / total_lines
    else:
        avg_branches_perline = 0

    return {'avg_branches_perline': avg_branches_perline}
Code Example #24
def count_loops(snippet):
    '''
	Calculates the average number of loops (for/while) per line in the snippet.
	'''
    total_loops = 0
    total_lines = 0
    for line in snippet:
        total_lines += 1
        try:
            line_tokens = list(tokenizer.tokenize(line))
            total_loops += sum([
                1 for token in line_tokens
                if token.value == "for" or token.value == "while"
            ])
        except Exception as err:
            total_loops = 0
            break

    if total_lines > 0:
        avg_loops_perline = total_loops / total_lines
    else:
        avg_loops_perline = 0

    return {'avg_loops_perline': avg_loops_perline}
Code Example #25
File: tokenizer.py Project: KTH/styler
def tokenize_with_white_space(file_content, relative=True):
    """
    Tokenize the java source code
    :param file_content: the java source code
    :return: (whitespace, tokens)
    """
    indentation_last_line = 1
    file_content_lines = file_content.split('\n')
    javalang_tokens = javalang_tokenizer.tokenize(file_content, parse_comments=True)
    tokens = []
    count = 0
    try:
        for t in javalang_tokens:
            count += 1
            if count > 1000000:
                break
            tokens.append(t)
            pass
    except Exception as err:
        print('Something wrong happened while tokenizing the following content: ' + file_content)
        return None, None
    whitespace = list()
    for index in range(0, len(tokens)-1):
        tokens_position = tokens[index].position
        next_token_position = tokens[index+1].position
        end_of_token = (tokens_position[0], tokens_position[1] + len(tokens[index].value))
        if end_of_token == next_token_position:
            whitespace.append((0,0,'None'))
        else:
            if end_of_token[0] == next_token_position[0]:
                # same line
                if file_content_lines[tokens_position[0]-1] != '':
                    if len(file_content_lines[tokens_position[0]-1]) > end_of_token[1] and file_content_lines[tokens_position[0]-1][end_of_token[1]] == '\t':
                        space_type = 'TB'
                    else:
                        space_type = 'SP'
                else:
                    space_type = 'None'
                whitespace.append(( 0, next_token_position[1] - end_of_token[1], space_type))
            else:
                # new line
                new_line = file_content_lines[next_token_position[0]-1]
                if new_line != '':
                    if new_line[get_line_indent(new_line) - 1] == '\t':
                        space_type = 'TB'
                    else:
                        space_type = 'SP'
                else:
                    space_type = 'None'
                if relative:
                    spaces = next_token_position[1] - indentation_last_line
                    whitespace.append((next_token_position[0] - end_of_token[0] - tokens[index].value.count('\n'), spaces, space_type))
                    indentation_last_line = next_token_position[1]
                else:
                    whitespace.append((next_token_position[0] - end_of_token[0] - tokens[index].value.count('\n'), next_token_position[1] - 1, space_type))
    
    count_line_break = 0
    for index in range(len(file_content)-1, 0, -1):
        if file_content[index] == '\n':
            count_line_break += 1
        elif file_content[index] != ' ' and file_content[index] != '\t':
            break

    whitespace.append((count_line_break, 0, 'None'))

    return whitespace, tokens
Code Example #26
File: parse.py Project: lqc/javalang
def parse(s):
    tokens = tokenize(s)
    parser = Parser(tokens)
    return parser.parse()
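These parse.py helpers wrap javalang's tokenizer and Parser; the same entry points are exposed by the installed javalang package, so a quick sketch of what they return:

from javalang.parse import parse, parse_expression, parse_member_signature

print(type(parse('class A { int f() { return 1; } }')).__name__)  # CompilationUnit
print(type(parse_expression('1 + 2')).__name__)                   # BinaryOperation
print(type(parse_member_signature('int f()')).__name__)           # MethodDeclaration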
Code Example #27
def gen_ugly(file_path, output_dir, modification_number=(1, 0, 0, 0, 0)):
    """
    Generate an ugly version of a .java file
    """
    insertions_sample_size_space = modification_number[0]
    insertions_sample_size_tab = modification_number[1]
    insertions_sample_size_newline = modification_number[2]
    insertions_sample_size = insertions_sample_size_space + insertions_sample_size_tab + insertions_sample_size_newline
    deletions_sample_size_space = modification_number[3]
    deletions_sample_size_newline = modification_number[4]
    deletions_sample_size = deletions_sample_size_space + deletions_sample_size_newline
    # deletions_sample_size = modification_number - insertions_sample_size
    with open(file_path) as f:
        file_lines = f.readlines()
    file_content = "".join(file_lines)

    tokens = tokenizer.tokenize(file_content)
    tokens = [t for t in tokens]
    # print("\n".join([ str(t) for t in tokens]))

    # Take a sample of locations suitable for insertions
    insertions_sample = random.sample(tokens,
                                      min(insertions_sample_size, len(tokens)))

    insertions = dict()

    insertions_chars = ([' '] * insertions_sample_size_space)
    insertions_chars.extend(['\t'] * insertions_sample_size_tab)
    insertions_chars.extend(['\n'] * insertions_sample_size_newline)
    random.shuffle(insertions_chars)

    for element, char in zip(insertions_sample, insertions_chars):
        insertions[element.position] = char

    # Select every locations suitable for deletions (i.e. before or after a separator/operator)
    deletions_spots = list()
    suitable_for_deletions = [tokenizer.Separator, tokenizer.Operator]
    for index in range(0, len(tokens) - 1):
        if (type(tokens[index]) in suitable_for_deletions):
            prev_token_position = tokens[index - 1].position
            tokens_position = tokens[index].position
            next_token_position = tokens[index + 1].position
            end_of_prev_token = (prev_token_position[0],
                                 prev_token_position[1] +
                                 len(tokens[index - 1].value))
            end_of_token = (tokens_position[0],
                            tokens_position[1] + len(tokens[index].value))
            if (end_of_prev_token != tokens_position):
                #print("prev : ", tokens[index-1].value , tokens[index].value, tokens[index+1].value, tokens[index].position)
                deletions_spots.append((end_of_prev_token, tokens_position))
            if (end_of_token != next_token_position):
                #print("next : ", tokens[index-1].value , tokens[index].value, tokens[index+1].value, tokens[index].position)
                deletions_spots.append((end_of_token, next_token_position))
    deletions_spots = list(set(deletions_spots))

    # Take a sample of locations suitable for deletions
    deletions_sample = random.sample(
        deletions_spots, min(deletions_sample_size, len(deletions_spots)))

    deletions = dict()
    for deletion_intervals in deletions_spots:
        #print(deletion_intervals)
        from_char = deletion_intervals[0]
        to_char = deletion_intervals[1]
        while from_char[0] <= to_char[0]:
            if from_char[0] == to_char[0]:
                interval = I.closedopen(from_char[1], to_char[1])
            else:
                interval = I.closedopen(from_char[1], I.inf)
            if (from_char[0] not in deletions):
                deletions[from_char[0]] = list()
            deletions[from_char[0]].append(interval)
            from_char = (from_char[0] + 1, 0)

    deletions_spots_chars = dict()
    line_num = 1
    for line in file_lines:
        char_num = 1
        for char in line:
            if (line_num in deletions):
                for intervals in deletions[line_num]:
                    if char_num in intervals:
                        if (char not in deletions_spots_chars):
                            deletions_spots_chars[char] = []
                        deletions_spots_chars[char].append(
                            (line_num, char_num))
            char_num = char_num + 1
        line_num = line_num + 1

    deletions = []
    if (' ' in deletions_spots_chars):
        deletions.extend(
            random.sample(deletions_spots_chars[' '],
                          deletions_sample_size_space))
    if ('\n' in deletions_spots_chars):
        deletions.extend(
            random.sample(deletions_spots_chars['\n'],
                          deletions_sample_size_newline))

    # print(insertions)
    # print(deletions)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_path = os.path.join(output_dir, f'./{file_path.split("/")[-1]}')

    # Write the output file
    with open(output_path, "w") as output_file_object:
        line_num = 1
        for line in file_lines:
            char_num = 1
            for char in line:
                skip = False
                if ((line_num, char_num) in deletions):
                    skip = True
                if ((line_num, char_num) in insertions):
                    output_file_object.write(insertions[(line_num, char_num)])
                if (not skip):
                    output_file_object.write(char)
                char_num = char_num + 1
            line_num = line_num + 1
    return tuple(set(deletions) | set(insertions.keys()))
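A minimal usage sketch of gen_ugly; it assumes `tokenizer` is javalang.tokenizer and that `I` is the python-intervals package (imported as `import intervals as I`), which is what the I.closedopen/I.inf calls suggest:

import os
import random
import tempfile
import intervals as I            # assumed dependency
from javalang import tokenizer   # assumed dependency

random.seed(0)
work_dir = tempfile.mkdtemp()
src_path = os.path.join(work_dir, 'A.java')
with open(src_path, 'w') as f:
    f.write('class A {\n    int x = 1;\n}\n')

# Insert one stray space somewhere, delete nothing.
changed = gen_ugly(src_path, os.path.join(work_dir, 'ugly'), (1, 0, 0, 0, 0))
print(changed)  # the (line, column) positions that were touched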
Code Example #28
def _parser(snippet):
    return Parser(tokenize(snippet))
Code Example #29
File: java_tokenize.py Project: leod/bobcode
def tokenize(s, out, skip_license=False):
    prev_line = 1
    prev_column = 1

    had_package = False

    for tok in tokenizer.tokenize(s):
        num_newlines = tok.position[0] - prev_line
        if num_newlines > 0:
            out.write(NEWLINE_SYMBOL * num_newlines)
            out.write('\n')
            prev_column = 1

        value = tok.value.strip()
        prev_line = tok.position[0] + value.count('\n')

        num_spaces = tok.position[1] - prev_column
        if num_spaces > 0:
            out.write(SPACE_SYMBOL * num_spaces)
            out.write(' ')

        prev_column = tok.position[1] + len(tok.value)

        if isinstance(tok, tokenizer.Keyword) and tok.value == 'package':
            had_package = True

        if skip_license and isinstance(tok,
                                       tokenizer.Comment) and not had_package:
            prev_line += 1
            continue

        # Split quotes from values
        if isinstance(tok, tokenizer.String):
            out.write('" ')
            value = value[1:-1]
        if isinstance(tok, tokenizer.Character):
            out.write("' ")
            value = value[1:-1]

        if isinstance(tok, tokenizer.String) or isinstance(
                tok, tokenizer.Comment):
            # Join consecutive space symbols or newline symbols into a single word,
            # so that whitespace is encoded the same between tokens and within
            # tokens.
            value = replace_consecutive(value,
                                        x=' ',
                                        y=SPACE_SYMBOL,
                                        y_after=' ')
            value = replace_consecutive(value,
                                        x='\n',
                                        y=NEWLINE_SYMBOL,
                                        y_after='\n')

        out.write(value.strip(' '))
        out.write(' ')

        if isinstance(tok, tokenizer.String):
            out.write('" ')
        if isinstance(tok, tokenizer.Character):
            out.write("' ")

    out.write(EOF_SYMBOL + '\n')
Code Example #30
 def _tokenize(self, program_string):
     import javalang.tokenizer as tokenizer
     tokens = tokenizer.tokenize(program_string)
     return map(lambda token: (type(token), token.value),
                tokens), program_string
Code Example #31
def tokenize(code):
    return [token.value for token in tokenizer.tokenize(code)]
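And the simplest form, with its assumed javalang import, for reference:

from javalang import tokenizer

print(tokenize('System.out.println("hi");'))
# ['System', '.', 'out', '.', 'println', '(', '"hi"', ')', ';']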