Example #1
# NOTE: Tokens, PARTIAL_PIPELINE and STANDARD_PIPELINE are assumed to be
# provided elsewhere in this project (tokenizer service and feature-pipeline
# definitions); the imports below cover the standard-library and pyenchant
# dependencies used in this excerpt.
import re
from datetime import datetime
from string import punctuation

import enchant


class FeatureGenerator(object):
    """
        @param:
            record -> a piece of raw text, or a list of tokens
    """
    def __init__(self, feature_for_separate_model=False):
        super(FeatureGenerator, self).__init__()
        self.dictionary = enchant.Dict('en_US')
        self.token_generator = Tokens()     # Connection established!
        self.record = None
        self.tokens = []
        self.features = None      # list of feature vectors, one per token; e.g. [[1,1,1,1], [...], ...]

        # Regex setup
        self.NUM_REGEX = re.compile(r'\d')
        self.CHAR_DIGIT_MIX_REGEX = re.compile(r'((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))', re.MULTILINE)
        self.NAME_ABBREV_REGEX = re.compile(r"([A-Z]\.-[A-Z]\.)|([A-Z]\.-[A-Z])|([A-Z]\.-)|(([A-Z]\.)+)|(O'[A-Z][a-z]+)")   # e.g. C.P.; C.-C.; O'Reilly
        self.PAGE_NO_REGEX = re.compile(r'\d+-\d+')

        # Gazetteer setup
        self.DELIMITERS = [',', '.', ]
        self.LBRACKET = ['(', '[', '{', '<', ]
        self.RBRACKET = [')', ']', '}', '>', ]
        self.APOSTROPHES = ["'s", "'re", "'d", ]
        self.QUOTATIONS = ['"', "''", "``", ]
        self.MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
        with open('data/name.lst', 'r') as f:
            self.NAME_LIST = [item.strip() for item in f]
        with open('data/venue.lst', 'r') as f:
            self.VENUE_LIST = [item.strip() for item in f]
        with open('data/ordinal.lst', 'r') as f:
            self.ORDINAL_LIST = [item.strip() for item in f]
        # with open('data/cities.lst', 'r') as f:
        #     self.CITY_LIST = [item.strip() for item in f]
        with open('data/countries.lst', 'r') as f:
            self.COUNTRY_LIST = [item.strip() for item in f]

        if feature_for_separate_model:
            self.pipeline = PARTIAL_PIPELINE
        else:
            self.pipeline = STANDARD_PIPELINE

    def close_connection(self):
        self.token_generator.close_connection()


    def build(self, record):
        self.record = record

        features = []
        need_tokenize = not isinstance(self.record, list)

        # record is raw text; tokenize it first
        if need_tokenize:
            response_obj = self.token_generator.tokenize(self.record)
            self.tokens = response_obj['tokens']

        # Already tokenized input
        else:   
            self.tokens = self.record

        self.num_tokens = len(self.tokens)  # number of tokens in this piece of text

        for i in range(self.num_tokens):
            sub_features = []
            for pipe in self.pipeline:
                action = getattr(self, pipe)
                sub_features.append(action(i))
            features.append(sub_features)
        self.features = features

        return features

    def token_length(self, record):
        return self.token_generator.token_length(record)


    def print_features(self):
        for i in range(self.num_tokens):
            print self.features[i], '\t\t', self.tokens[i]


    ################################### Feature functions ###################################
    # Feature output format:
    # [
    #   [([1,0,0,1], 1), ([1,1,1,1], 0), (...)...], <-- One piece of training sample (x, y) where x=x1x2x3...xm, y=y1y2y3...ym <-- a sentence representation in feature vectors, in sequence
    #   [.......................],  <-- another sentence, parallel to the previous one, processed independently
    #   ...
    # ]
    # Assume segments are space-delimited, so each feature describes a segment; the challenge will be the tokenization
    ################################### Local Features #####################################

    # Name abbreviations, e.g. C.B. or C.-C.
    def f_is_name_abbrev(self, idx):
        token = self.tokens[idx]
        if self.NAME_ABBREV_REGEX.match(token) is None:
            return 0
        return 1

    def f_is_apostrophes(self, idx):
        token = self.tokens[idx]
        return int(token in self.APOSTROPHES)

    def f_is_capitalized(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token[0].isupper())

    def f_is_all_upper(self, idx):
        token = self.tokens[idx]
        if len(token) <= 2:
            return 0
        return int(token.isupper())

    def f_is_english(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(self.dictionary.check(token.lower()) and len(token) > 1)

    def f_has_both_char_and_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if self.CHAR_DIGIT_MIX_REGEX.search(token) is None:
            return 0
        return 1

    def f_is_delimiter(self, idx):
        token = self.tokens[idx]
        if len(token) != 1:
            return 0
        return int(token in self.DELIMITERS)

    def f_is_quotation(self, idx):
        token = self.tokens[idx]
        return int(token in self.QUOTATIONS)

    def f_is_punctuation(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(len(token) == 1 and token in punctuation)

    # def f_is_sequential_punctuation(self, idx): #e.g. K.C.-C. Chang
    #     token = self.tokens[idx]
    #     if len(token) <= 1:
    #         return 0
    #     ret = 1
    #     for t in token:
    #         if t not in punctuation:
    #             ret = 0
    #             break
    #     return ret

    def f_has_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if self.NUM_REGEX.search(token) is None:
            return 0
        return 1

    def f_is_all_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.isdigit())

    def f_is_possible_page_number(self, idx):
        token = self.tokens[idx]
        if self.PAGE_NO_REGEX.match(token) is None:
            return 0
        return 1

    def f_is_month(self, idx):
        token = self.tokens[idx]
        return int(token in self.MONTHS)

    def f_is_possible_year(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.isdigit() and len(token)==4 and int(token)>= 1980 and int(token)<=datetime.now().year)




    ################################### Dictionary Features ################################
    def f_is_in_namelist(self, idx):
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.NAME_LIST)

    def f_is_ordinal(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.ORDINAL_LIST)


    # Also handles some common venue tokens that are also common English words
    # TODO: make this more fine-grained
    def f_is_in_venuelist(self, idx):
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            prev_token = ''

        # Special case handling
        if token.strip() in ['In', 'Appear', 'Appears', 'Appeared', ] and len(prev_token)>0 and prev_token in ['.', ',', ';', '(', ]:
            return 1

        return int(token.lower().strip() in (self.VENUE_LIST + self.ORDINAL_LIST + self.COUNTRY_LIST) )


    ################################### Global Features ####################################

    def f_has_lbracket_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int( prev_token in self.LBRACKET )

    def f_has_rbracket_after(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int( next_token in self.RBRACKET )

    def f_has_quotation_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int( prev_token in self.QUOTATIONS )

    def f_has_quotation_after(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int( next_token in self.QUOTATIONS )


    #????
    def f_is_possible_volume(self, idx):
        token = self.tokens[idx]
        if ((idx-1) >=0) and ((idx+1)<self.num_tokens):
            prev_token = self.tokens[idx-1]
            next_token = self.tokens[idx+1]
            return int(prev_token in self.LBRACKET and next_token in self.RBRACKET and token.isdigit())
        else:
            return 0

    # ???? necessary?
    def f_is_at_second_half_of_string(self, idx):
        token = self.tokens[idx]
        return int(idx > self.num_tokens/2)

    def f_has_delimiter_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx-1) >= 0:
            prev_token = self.tokens[idx-1]
        else:
            return 0
        return int(len(prev_token)==1 and prev_token in self.DELIMITERS)

    def f_has_delimiter_after(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int( len(next_token)==1 and next_token in self.DELIMITERS)

    #????
    def f_is_an_and_between_two_names(self, idx):
        token = self.tokens[idx]
        if not ((idx+1) < self.num_tokens and (idx-1) >= 0):
            return 0
        # 'and' flanked by capitalized, non-dictionary tokens, e.g. "Smith and Jones"
        return int(token.strip().lower() == 'and' and
                   self.f_is_capitalized(idx-1) and self.f_is_english(idx-1) == 0 and
                   self.f_is_capitalized(idx+1))

    def f_is_followed_by_year(self, idx):
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens:
            next_token = self.tokens[idx+1]
        else:
            return 0
        return int((len(next_token)==2 or len(next_token)==4) and next_token.isdigit() and not token.isdigit())

    # Addressing the possible new notions in the title of publications
    def f_is_possible_new_notion(self, idx):
        token = self.tokens[idx]
        if (idx+2) < self.num_tokens:
            next_token = self.tokens[idx+1]
            next_next_token = self.tokens[idx+2]
        else:
            return 0
        p1 = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
        p2 = re.compile(r'^[A-Z][a-z0-9]+$', re.MULTILINE)
        p3 = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
        p4 = re.compile(r'^[a-z0-9]+$', re.MULTILINE)
        p5 = re.compile(r'[A-Z]*[A-Za-z]+-[A-Za-z]+')    #specific terminology ???? content-aware; Group-By
        # Xxxxxx, XxxxxXxxxx, XxxxXxxxXxxx, Xxxx xxxx, Xxxx Xxxx, XXXX

        pattern_1 = token.isupper() and next_token==':'
        pattern_2 = (p1.match(token) is not None) and next_token==':'
        pattern_3 = (p2.match(token) is not None) and next_token==':'
        pattern_4 = (p3.match(token) is not None) and next_token==':'
        pattern_5 = (p2.match(token) is not None) and (p2.match(next_token) is not None) and next_next_token==':'
        pattern_6 = (p2.match(token) is not None) and (p4.match(next_token) is not None) and next_next_token==':'
        pattern_7 = p5.match(token) is not None

        return int(pattern_1 or pattern_2 or pattern_3 or pattern_4 or pattern_5 or pattern_6 or pattern_7)


    def f_is_possible_boundary(self, idx):  #check if period.  Pending feature
        token = self.tokens[idx]
        if (idx+1) < self.num_tokens and (idx-1)>=0:
            next_token = self.tokens[idx+1]
            prev_token = self.tokens[idx-1]
        else:
            return 0

        return int( (token == '.' and prev_token.islower() and next_token[0].isupper()) or 
                    (token[-1]=='.' and token[0].islower() and next_token[0].isupper()) 
                  )
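
Usage sketch (not part of the original example): a minimal, hypothetical driver showing how FeatureGenerator might be wired up. The pipeline constant, the sample citation string, and the Tokens-backed tokenizer are assumptions for illustration; the real project defines its own PARTIAL_PIPELINE / STANDARD_PIPELINE and data files.

# Hypothetical pipeline: a list of FeatureGenerator method names. build()
# resolves each name with getattr() and calls it once per token index.
STANDARD_PIPELINE = [
    'f_is_capitalized',
    'f_is_all_upper',
    'f_is_all_digit',
    'f_is_possible_year',
    'f_is_in_namelist',
    'f_is_in_venuelist',
]

generator = FeatureGenerator()
try:
    # Raw text goes through the Tokens service; a pre-tokenized list of
    # strings would be used as-is.
    record = 'J. Smith and A. Jones. A Study of Parsing. In VLDB, 1999.'
    features = generator.build(record)
    # features[i] is the feature vector for the i-th token, in pipeline order.
    for vector, token in zip(features, generator.tokens):
        print('%s\t%s' % (vector, token))
finally:
    generator.close_connection()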