Example #1
    def unwrap_code_lines(tokens):
        unwrapped_tokens = []
        include = True
        prev_tokens = [
            Token('', 'newline', False),
            Token('', 'newline', False),
            Token('', 'newline', False)
        ]

        for token in tokens:
            if token.group == 'line continuation':
                include = False
                prev_tokens.append(token)
                prev_tokens = prev_tokens[1:]

            if token.group == 'whitespace' and \
              prev_tokens[-1].group == 'newline' and \
              prev_tokens[-2].group == 'line continuation':
                if prev_tokens[-3].group != 'whitespace':
                    unwrapped_tokens.append(Token(' ', 'whitespace', False))
            elif include:
                unwrapped_tokens.append(token)
                prev_tokens.append(token)
                prev_tokens = prev_tokens[1:]

            if token.group == 'newline':
                if not include:
                    prev_tokens.append(token)
                    prev_tokens = prev_tokens[1:]
                include = True

        return unwrapped_tokens
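Every example in this collection builds and inspects Token objects through a three-argument constructor and the attributes text, group, and is_operand. The class itself is not included here; a minimal sketch consistent with that usage (the __repr__ is illustrative only) could be:

    class Token:
        """A lexical token: its raw text, its classification group, and
        whether it can serve as an operand."""

        def __init__(self, text, group, is_operand):
            self.text = text
            self.group = group
            self.is_operand = is_operand

        def __repr__(self):
            return 'Token({!r}, {!r}, {!r})'.format(
                self.text, self.group, self.is_operand)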
Example #2
    def tokenize_line(self, line, tokenizer, wide):
        # break apart the line based on fixed format
        tokens = []

        # The fixed-format PL/1 line format is:
        # 1: space or C or *
        # 2-72: program text
        # 73-: identification, traditionally sequence number (ignored)
        line_indicator = line[0:1]
        if wide:
            line_text = line[1:]
            line_identification = ''
        else:
            line_text = line[1:72]
            line_identification = line[72:]

        # tokenize the line indicator
        if line_indicator in ['C', '*']:
            tokens.append(Token(line, 'comment', False))
        else:
            if len(line_indicator) > 0 and line_indicator != ' ':
                tokens.append(Token(line_indicator, 'invalid', False))
            else:
                tokens.append(Token(' ', 'whitespace', False))
            # tokenize the code
            tokens += tokenizer.tokenize(line_text)

            # tokenize the line identification
            if len(line_identification) > 0:
                tokens.append(
                    Token(line_identification, 'line identification', False))

        tokens.append(Token('\n', 'newline', False))

        return tokens
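The slicing above maps directly onto card columns; a standalone illustration with a made-up fixed-format line:

    line = 'C     COMPUTE THE MEAN OF THE SAMPLES'
    line_indicator = line[0:1]       # column 1: here 'C', so a comment line
    line_text = line[1:72]           # columns 2-72: program text
    line_identification = line[72:]  # columns 73 on: sequence field, if any
    print(repr(line_indicator), repr(line_text))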
Example #3
    def extract_keywords_from_identifiers(tokens, keywords, operators):
        new_tokens = []

        words = keywords + operators

        for token in tokens:
            if token.group == 'variable':
                new_texts = BasicExaminer.extract_keywords(token.text, words)

                for new_text in new_texts:
                    if new_text in keywords:
                        new_token = Token(new_text, 'keyword', False)
                    elif new_text in operators:
                        new_token = Token(new_text, 'operator', False)
                    else:
                        if new_text.isdigit():
                            new_token = Token(new_text, 'number', True)
                        else:
                            new_token = Token(new_text, 'variable', True)

                    new_tokens.append(new_token)
            else:
                new_tokens.append(token)

        return new_tokens
Example #4
    def calc_line_format_confidence(self):
        drop_types = ['whitespace', 'comment', 'line continuation']
        tokens = Examiner.drop_tokens(self.tokens, drop_types)

        line_bracket_count = 0
        num_bracket_count = 0
        prev2_token = Token('\n', 'newline', False)
        prev_token = Token('\n', 'newline', False)
        for token in tokens:
            if token.group == 'group' and token.text == '{':
                num_bracket_count += 1

                if prev_token.group == 'newline' and\
                  (prev2_token.group != 'group' or prev2_token.text != '{'):
                    line_bracket_count += 1
                    self.errors.append({
                        'TYPE': 'LINE FORMAT',
                        'TOKEN': token.text
                    })

            prev2_token = prev_token
            prev_token = token

        line_format_confidence = 1.0

        if num_bracket_count > 0:
            line_format_confidence = 1.0 - (line_bracket_count /
                                            num_bracket_count)

        self.confidences['line format'] = line_format_confidence
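The resulting confidence is simply the fraction of opening braces that did not start their own line. With illustrative counts:

    num_bracket_count = 10   # opening braces seen (hypothetical)
    line_bracket_count = 2   # braces that began a new line (hypothetical)
    line_format_confidence = 1.0 - (line_bracket_count / num_bracket_count)
    print(line_format_confidence)  # 0.8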
Example #5
  def tokenize_alt_line(self, line, line_indicator):
    token = None

    if line_indicator in ['*', '/', 'D', 'd']:
      # the entire line is a comment (including DEBUG lines)
      token = Token(line[6:], 'comment', False)
    if line_indicator == '$':
      token = Token(line[6:], 'preprocessor', False)

    return token
Example #6
    def get_tokens(self):
        if self.text is None:
            return None

        if len(self.text) != 2:
            return None

        return [
            Token(self.text[0], 'identifier', True),
            Token(self.text[1], 'group', False)
        ]
Example #7
  def get_tokens(self):
    if self.text is None:
      return None

    token1 = Token('', 'comment', False)
    token2 = Token('', 'comment', False)

    if self.text.startswith('Remark'):
      token1 = Token('Remark', 'keyword', False)
      token2 = Token(self.text[6:], 'comment', False)

    return [token1, token2]
Example #8
    def get_tokens(self):
        if self.text is None:
            return None

        if self.case_sensitive:
            if self.text.startswith(self.lead):
                return [Token(self.text, self.group, False)]
        else:
            if self.text.lower().startswith(self.lead):
                return [Token(self.text, self.group, False)]

        return None
Example #9
    def get_tokens(self):
        if self.token1 is None:
            return None

        if self.token2 is None:
            token1 = Token(self.token1, 'keyword', False)
            tokens = [token1]
        else:
            token1 = Token(self.token1, 'keyword', False)
            token2 = Token(self.token2, 'comment', False)
            tokens = [token1, token2]
        return tokens
Example #10
  def tokenize_line_indicator(self, line_indicator):
    token = None

    if line_indicator == ' ':
      token = Token(' ', 'whitespace', False)
    else:
      if line_indicator == '-':
        token = Token(line_indicator, 'continuation line', False)
      else:
        if line_indicator != '':
          token = Token(line_indicator, 'invalid', False)

    return token
Example #11
  def tokenize_line_number(self, line_number):
    token = None

    if len(line_number) > 0:
      if line_number.isspace():
        token = Token(line_number, 'whitespace', False)
      else:
        if line_number.isdigit():
          token = Token(line_number, 'line number', False)
        else:
          token = Token(line_number, 'line identification', False)

    return token
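The six-character sequence field is classified by two string tests; the same logic as a standalone sketch over made-up field values:

    for field in ['      ', '000100', 'PAYRL1']:
        if field.isspace():
            group = 'whitespace'
        elif field.isdigit():
            group = 'line number'
        else:
            group = 'line identification'
        print(repr(field), '->', group)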
Example #12
    def get_tokens(self):
        if self.text is None:
            return None

        # split the text into 'TEXT', content, and 'ENDTEXT' tokens
        len_start = len(self.prefix)
        len_end = len(self.suffix)

        starter_token = Token(self.text[:len_start], 'keyword', False)
        ender_token = Token(self.text[-len_end:], 'keyword', False)
        content = Token(self.text[len_start:-len_end], 'string', True)

        return [starter_token, content, ender_token]
Example #13
    def get_tokens(self):
        if self.text is None:
            return None

        tokens = None
        if self.text[-1] == '.':
            # a terminating dot is not part of the PIC
            token1 = Token(self.text[:-1], 'picture', True)
            token2 = Token('.', 'statement terminator', False)
            tokens = [token1, token2]
        else:
            tokens = [Token(self.text, 'picture', True)]

        return tokens
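The split on a trailing period can be seen in isolation; with a hypothetical PICTURE string:

    text = 'S9(5)V99.'   # hypothetical PIC clause ending in a period
    if text[-1] == '.':
        picture, terminator = text[:-1], text[-1]
    else:
        picture, terminator = text, None
    print(picture, terminator)  # S9(5)V99 .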
Example #14
    def convert_identifiers_to_labels(self):
        prev_2_token = Token('\n', 'newline', False)
        prev_token = Token('\n', 'newline', False)

        for token in self.tokens:
            if token.group == 'group' and token.text == ',' and \
              prev_token.group == 'identifier' and \
              prev_2_token.group == 'newline':
                prev_token.group = 'label'
                prev_token.is_operand = False

            if token.group not in ['whitespace']:
                prev_2_token = prev_token
                prev_token = token
Example #15
    def calc_operator_3_confidence(self, tokens, num_operators, group_ends,
                                   allow_pairs):
        errors = 0
        prev_token = Token('\n', 'newline', False)

        lower_unary_operators = []
        for op in self.unary_operators:
            lower_unary_operators.append(op.lower())

        for token in tokens:
            prev_token_operand = prev_token.is_operand or \
              prev_token.text in group_ends or \
              prev_token.text.lower() == 'end'

            if token.group == 'operator' and \
              not prev_token_operand and \
              prev_token.text not in self.adjective_operators and \
              prev_token.text not in self.postfix_operators and \
              token.text not in self.keyword_postfix and \
              token.text.lower() not in lower_unary_operators and \
              [prev_token.text, token.text] not in allow_pairs:
                errors += 1
                self.errors.append({
                    'TYPE': 'OPERATOR3',
                    'FIRST': prev_token.text,
                    'SECOND': token.text
                })

            prev_token = token

        operator_confidence_3 = 1.0 - errors / num_operators

        self.confidences['operator_3'] = operator_confidence_3
Example #16
    def calc_operator_2_confidence(self, tokens, num_operators, allow_pairs):
        errors = 0
        prev_token = Token('\n', 'newline', False)

        lower_unary_operators = []
        for op in self.unary_operators:
            lower_unary_operators.append(op.lower())

        for token in tokens:
            if token.group == 'operator' and \
              prev_token.group == 'operator' and \
              prev_token.text not in self.adjective_operators and \
              prev_token.text not in self.postfix_operators and \
              token.text.lower() not in lower_unary_operators and \
              [prev_token.text, token.text] not in allow_pairs:
                errors += 1
                self.errors.append({
                    'TYPE': 'OPERATOR2',
                    'FIRST': prev_token.text,
                    'SECOND': token.text
                })

            prev_token = token

        operator_confidence_2 = 1.0 - errors / num_operators

        self.confidences['operator_2'] = operator_confidence_2
Example #17
    def calc_token_2_confidence(self, allowed_tokens=None):
        if allowed_tokens is None:
            allowed_tokens = []

        num_repeated_tokens = 0
        prev_token = Token('\n', 'newline', False)

        allowed_groups = [
            'invalid', 'whitespace', 'newline', 'comment', 'line description',
            'group'
        ]
        for token in self.tokens:
            if token.group not in allowed_groups and token.text not in allowed_tokens:
                if token.group == prev_token.group and token.text == prev_token.text:
                    num_repeated_tokens += 1

                    self.errors.append({
                        'TYPE': 'TOKEN',
                        'REPEATED': token.text,
                        'FIRST': '',
                        'SECOND': ''
                    })

            prev_token = token

        repeat_confidence = 1.0

        if len(self.tokens) > 0:
            repeat_unconfidence = num_repeated_tokens / len(self.tokens)
            repeat_confidence = 1.0 - repeat_unconfidence

        self.confidences['repeated token'] = repeat_confidence
Example #18
    def combine_identifier_colon(tokens, separator_groups, separator_texts,
                                 ignore_groups):
        new_list = []

        new_token = None
        first_printable_token = True

        for token in tokens:
            if token.text == ':' and \
              new_token is not None and new_token.group == 'identifier' and \
              first_printable_token:
                new_token = Token(new_token.text + token.text, 'label', False)
            else:
                if new_token is not None:
                    new_list.append(new_token)
                    if new_token.group in separator_groups or \
                      new_token.text in separator_texts:
                        first_printable_token = True
                    else:
                        if new_token.group not in ignore_groups:
                            first_printable_token = False
                new_token = token

        if new_token is not None:
            new_list.append(new_token)

        return new_list
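A small driver shows the intended effect: an identifier that opens a line and is immediately followed by ':' is folded into a single 'label' token. The Token stub and the argument values below are illustrative, and the sketch assumes combine_identifier_colon from Example #18 is in scope as a plain function.

    class Token:
        def __init__(self, text, group, is_operand):
            self.text, self.group, self.is_operand = text, group, is_operand

    tokens = [
        Token('start', 'identifier', True),
        Token(':', 'group', False),
        Token('\n', 'newline', False),
    ]
    # assumes combine_identifier_colon (Example #18) is importable or in scope
    merged = combine_identifier_colon(
        tokens, separator_groups=['newline'], separator_texts=[],
        ignore_groups=['whitespace'])
    print([(t.text, t.group) for t in merged])
    # expected: [('start:', 'label'), ('\n', 'newline')]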
Example #19
    def calc_operator_4_confidence(self, tokens, num_operators, group_starts,
                                   allow_pairs):
        errors = 0
        prev_token = Token('\n', 'newline', False)

        lower_unary_operators = []
        for op in self.unary_operators:
            lower_unary_operators.append(op.lower())

        for token in tokens:
            prev_token_postfix_operator = prev_token.text.lower() in (
                op.lower() for op in self.postfix_operators)

            if prev_token.group == 'operator' and \
              not prev_token_postfix_operator and \
              not token.is_operand and \
              token.text.lower() not in lower_unary_operators and \
              token.text not in group_starts and \
              [prev_token.text, token.text] not in allow_pairs:
                errors += 1
                self.errors.append({
                    'TYPE': 'OPERATOR4',
                    'FIRST': prev_token.text,
                    'SECOND': token.text
                })

            prev_token = token

        operator_confidence_4 = 1.0 - errors / num_operators

        self.confidences['operator_4'] = operator_confidence_4
Example #20
    def calc_operand_n_confidence(self, tokens, operand_types, max_count):
        confidence_name_u = 'OPERAND_' + str(max_count)
        confidence_name_l = 'operand_' + str(max_count)
        n_operand_count = 0
        consec_count = 0
        prev_token = Token('\n', 'newline', False)
        for token in tokens:
            if token.group in operand_types:
                consec_count += 1
                if consec_count > max_count:
                    n_operand_count += 1
                    self.errors.append({
                        'TYPE': confidence_name_u,
                        'FIRST': prev_token.text,
                        'SECOND': token.text
                    })
            else:
                consec_count = 0

            prev_token = token

        operand_confidence = 1.0

        if len(tokens) > 0:
            operand_confidence = 1.0 - (n_operand_count / len(tokens))

        self.confidences[confidence_name_l] = operand_confidence
Example #21
    def combine_identifier_colon(tokens, separator_groups, separator_texts,
                                 ignore_groups):
        new_list = []

        new_token = None
        first_printable_token = True
        in_declaration = True

        for token in tokens:
            if token.text == ':' and \
              new_token is not None and new_token.group == 'identifier' and \
              first_printable_token and \
              not in_declaration:
                new_token = Token(new_token.text + token.text, 'label', False)
            else:
                if new_token is not None:
                    new_list.append(new_token)
                    if new_token.group in separator_groups or \
                      new_token.text in separator_texts:
                        first_printable_token = True
                    else:
                        if new_token.group not in ignore_groups:
                            first_printable_token = False
                new_token = token

            if token.text.lower() in ['procedure', 'function']:
                in_declaration = True

            if token.text.lower() == 'begin':
                in_declaration = False

        if new_token is not None:
            new_list.append(new_token)

        return new_list
Example #22
    def tokenize_asm_code(text, tab_size, opcode_tokenizer, opcode_extras,
                          args_tokenizer, label_leads, label_mids, label_ends,
                          comment_leads, line_comment_leads, use_line_id):
        lines = text.split('\n')

        tokens = []
        indents = []
        previous_was_continued = False

        for line in lines:
            newline = '\n'
            if len(line) > 0 and line[-1] == '\r':
                newline = '\r\n'
            line = line.rstrip('\r')
            line = line.rstrip()
            line = Tokenizer.tabs_to_spaces(line, tab_size)

            # get tokens and indents
            line_tokens, line_indents, was_continued = Tokenizer.tokenize_asm_line(
                line, previous_was_continued, opcode_tokenizer, opcode_extras,
                args_tokenizer, label_leads, label_mids, label_ends,
                comment_leads, line_comment_leads, use_line_id)
            tokens += line_tokens
            indents.append(line_indents)

            tokens.append(Token(newline, 'newline', False))
            previous_was_continued = was_continued

        # return tokens and indents
        return tokens, indents
Example #23
    def calc_value_value_different_confidence(self, tokens):
        # remove tokens we don't care about
        drop_types = [
            'whitespace', 'comment', 'line description', 'line continuation'
        ]
        tokens = Examiner.drop_tokens(tokens, drop_types)

        value_types = ['number', 'string', 'symbol']

        two_value_count = 0
        prev_token = Token('\n', 'newline', False)
        for token in tokens:
            if token.group in value_types and\
              prev_token.group in value_types and\
              not token.group == prev_token.group:
                two_value_count += 1
                self.errors.append({
                    'TYPE': 'VALUE VALUE DIFFERENT',
                    'FIRST': prev_token.text,
                    'SECOND': token.text
                })

            prev_token = token

        value_value_confidence = 1.0

        if len(tokens) > 0:
            value_value_confidence = 1.0 - (two_value_count / len(tokens))

        self.confidences['value value'] = value_value_confidence
Example #24
    def convert_asm_identifiers_to_labels(self):
        prev_token = Token('\n', 'newline', False)

        for token in self.tokens:
            if token.group == 'identifier' and prev_token.group == 'newline':
                token.group = 'label'

            prev_token = token
Example #25
    def get_tokens(self):
        if self.text is None:
            return None

        if self.text == '':
            return None

        return [Token(self.text, 'comment', False)]
Example #26
    def convert_keywords_to_identifiers(tokens):
        prev_token = Token('\n', 'newline', False)

        for token in tokens:
            if prev_token.text == 'type' and token.group != 'class':
                prev_token.group = 'identifier'

            prev_token = token
Example #27
    def get_tokens(self):
        if self.text is None:
            return None

        if self.text.startswith(self.prefix):
            return [Token(self.text, 'preprocessor', False)]

        return None
Example #28
  def tokenize_line(self, line, tokenizer, wide):
    # break apart the line based on fixed format
    tokens = []

    # The COBOL line format is:
    # 1-6: line number or blank (ignored)
    # 7: space or one of *, /, D, d, $, -
    # 8-72: program text
    # 73-: identification, traditionally sequence number (ignored)

    line_indicator = line[6:7]
    if wide:
      line_text = line[7:]
      line_identification = ''
    else:
      line_text = line[7:72]
      line_identification = line[72:]

    if line.startswith(('//', '/*')):
      tokens.append(Token(line, 'jcl', False))
    else:
      line_number = line[:6]
      token = self.tokenize_line_number(line_number)
      if token is not None:
        tokens.append(token)

      # tokenize the line indicator
      if line_indicator in ['*', '/', 'D', 'd', '$']:
        token = self.tokenize_alt_line(line, line_indicator)
        if token is not None:
          tokens.append(token)
      else:
        token = self.tokenize_line_indicator(line_indicator)
        if token is not None:
          tokens.append(token)

        # tokenize the code
        tokens += tokenizer.tokenize(line_text)

    # tokenize the line identification
    if len(line_identification) > 0:
      tokens.append(Token(line_identification, 'line identification', False))

    tokens.append(Token('\n', 'newline', False))

    return tokens
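As with the earlier fixed-format example, the slices map onto card columns; a standalone illustration with a made-up COBOL source line:

    line = '000100*    THIS LINE IS A COMMENT'
    line_number = line[:6]           # columns 1-6: sequence/line number
    line_indicator = line[6:7]       # column 7: here '*', a comment line
    line_text = line[7:72]           # columns 8-72: program text
    line_identification = line[72:]  # columns 73 on: identification field
    print(repr(line_number), repr(line_indicator))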
Example #29
    def get_tokens(self):
        if self.text is None:
            return None

        m = re.match(self.regex, self.text)
        if m is None:
            return None

        g = m.groups()
        if len(g) != 4:
            return None

        token1 = Token(g[0], 'keyword', False)
        token2 = Token(g[1], 'whitespace', False)
        token3 = Token(g[2], 'keyword', False)
        token4 = Token(g[3], 'comment', False)

        tokens = [token1, token2, token3, token4]

        return tokens
Example #30
    def convert_operators_to_identifiers(self):
        prev_token = Token('\n', 'newline', False)

        for token in self.tokens:
            if token.text == '*' and prev_token.text == '::':
                token.group = 'identifier'

            if token.group not in ['whitespace', 'comment', 'newline']:
                prev_token = token