# Assumes module-level imports of re, enchant, datetime.datetime and
# string.punctuation, plus the project's own Tokens, router, log_err,
# PARTIAL_PIPELINE and STANDARD_PIPELINE definitions.


class FeatureGenerator(object):
    """
    @param record: a piece of raw text, or a list of tokens
    """

    def __init__(self, feature_for_separate_model=False):
        super(FeatureGenerator, self).__init__()
        self.dictionary = enchant.Dict('en_US')
        self.token_generator = Tokens()  # Connection established!
        self.record = None
        self.tokens = []
        self.features = None  # one feature list per token; e.g. [[1,1,1,1], [...], ...]

        # Regex setup
        self.NUM_REGEX = re.compile(r'\d')
        self.CHAR_DIGIT_MIX_REGEX = re.compile(
            r'((^[a-zA-Z]+\d{4}$)|(^[a-zA-Z]+\d{2}$))|((^\d{4}[a-zA-Z]+$)|(^\d{2}[a-zA-Z]+$))',
            re.MULTILINE)
        self.NAME_ABBREV_REGEX = re.compile(
            r"([A-Z]\.-[A-Z]\.)|([A-Z]\.-[A-Z])|([A-Z]\.-)|(([A-Z]\.)+)|(O'[A-Z][a-z]+)")  # C.P.; C.-C.; O'Reilly
        self.PAGE_NO_REGEX = re.compile(r'\d+-\d+')

        # Gazetteer setup
        self.DELIMITERS = [',', '.']
        self.LBRACKET = ['(', '[', '{', '<']
        self.RBRACKET = [')', ']', '}', '>']
        self.APOSTROPHES = ["'s", "'re", "'d"]
        self.QUOTATIONS = ['"', "''", "``"]
        self.MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
                       'July', 'August', 'September', 'October', 'November',
                       'December']
        self.NAME_LIST = [item.strip() for item in open('data/name.lst', 'r').readlines()]
        self.VENUE_LIST = [item.strip() for item in open('data/venue.lst', 'r').readlines()]
        self.ORDINAL_LIST = [item.strip() for item in open('data/ordinal.lst', 'r').readlines()]
        # self.CITY_LIST = [item.strip() for item in open('data/cities.lst', 'r').readlines()]
        self.COUNTRY_LIST = [item.strip() for item in open('data/countries.lst', 'r').readlines()]

        if feature_for_separate_model:
            self.pipeline = PARTIAL_PIPELINE
        else:
            self.pipeline = STANDARD_PIPELINE

    def close_connection(self):
        self.token_generator.close_connection()

    def build(self, record):
        self.record = record
        features = []
        if type(self.record) is list:
            # Already tokenized input
            self.tokens = self.record
        else:
            # Raw text: tokenize it first
            response_obj = self.token_generator.tokenize(self.record)
            self.tokens = response_obj['tokens']
        # Count how many tokens there are in this piece of text.
        self.num_tokens = len(self.tokens)
        for i in range(self.num_tokens):
            sub_features = []
            for pipe in self.pipeline:
                action = getattr(self, pipe)
                sub_features.append(action(i))
            features.append(sub_features)
        self.features = features
        return features

    def token_length(self, record):
        return self.token_generator.token_length(record)

    def print_features(self):
        for i in range(self.num_tokens):
            print self.features[i], '\t\t', self.tokens[i]

    ################################### Feature functions ###################################
    # Feature output format:
    # [
    #   [([1,0,0,1], 1), ([1,1,1,1], 0), ...],  <-- one training sample (x, y), where
    #                                               x = x1 x2 ... xm and y = y1 y2 ... ym:
    #                                               a sentence as a sequence of feature vectors
    #   [...],                                  <-- another sentence, parallel with the
    #                                               previous one, processed independently
    #   ...
    # ]
    # Assumes segments are space-delimited, so each feature describes a segment;
    # the challenge will be tokenizing.

    ################################### Local Features #####################################

    def f_is_name_abbrev(self, idx):
        # e.g. C.B. or C.-C.
        token = self.tokens[idx]
        if self.NAME_ABBREV_REGEX.match(token) is None:
            return 0
        return 1

    def f_is_apostrophes(self, idx):
        token = self.tokens[idx]
        return int(token in self.APOSTROPHES)

    def f_is_capitalized(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token[0].isupper())

    def f_is_all_upper(self, idx):
        token = self.tokens[idx]
        if len(token) <= 2:
            return 0
        return int(token.isupper())

    def f_is_english(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(self.dictionary.check(token.lower()) and len(token) > 1)

    def f_has_both_char_and_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if self.CHAR_DIGIT_MIX_REGEX.search(token) is None:
            return 0
        return 1

    def f_is_delimiter(self, idx):
        token = self.tokens[idx]
        if len(token) != 1:
            return 0
        return int(token in self.DELIMITERS)

    def f_is_quotation(self, idx):
        token = self.tokens[idx]
        return int(token in self.QUOTATIONS)

    def f_is_punctuation(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(len(token) == 1 and token in punctuation)

    # def f_is_sequential_punctuation(self, idx):  # e.g. K.C.-C. Chang
    #     token = self.tokens[idx]
    #     if len(token) <= 1:
    #         return 0
    #     ret = 1
    #     for t in token:
    #         if t not in punctuation:
    #             ret = 0
    #             break
    #     return ret

    def f_has_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if self.NUM_REGEX.search(token) is None:
            return 0
        return 1

    def f_is_all_digit(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.isdigit())

    def f_is_possible_page_number(self, idx):
        token = self.tokens[idx]
        if self.PAGE_NO_REGEX.match(token) is None:
            return 0
        return 1

    def f_is_month(self, idx):
        token = self.tokens[idx]
        return int(token in self.MONTHS)

    def f_is_possible_year(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.isdigit() and len(token) == 4 and
                   1980 <= int(token) <= datetime.now().year)

    ################################### Dictionary Features ################################

    def f_is_in_namelist(self, idx):
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.NAME_LIST)

    def f_is_ordinal(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        return int(token.lower().strip() in self.ORDINAL_LIST)

    # Also handles some venue tokens that double as common English words.
    # TODO: make this more delicate
    def f_is_in_venuelist(self, idx):
        token = self.tokens[idx].encode('ascii', 'ignore')
        if len(token) == 0:
            return 0
        if (idx - 1) >= 0:
            prev_token = self.tokens[idx - 1]
        else:
            prev_token = ''
        # Special case handling: "In", "Appears in", etc. right after punctuation
        if token.strip() in ['In', 'Appear', 'Appears', 'Appeared'] and \
                len(prev_token) > 0 and prev_token in ['.', ',', ';', '(']:
            return 1
        return int(token.lower().strip() in
                   (self.VENUE_LIST + self.ORDINAL_LIST + self.COUNTRY_LIST))

    ################################### Global Features ####################################

    def f_has_lbracket_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx - 1) >= 0:
            prev_token = self.tokens[idx - 1]
        else:
            return 0
        return int(prev_token in self.LBRACKET)

    def f_has_rbracket_after(self, idx):
        if (idx + 1) < self.num_tokens:
            next_token = self.tokens[idx + 1]
        else:
            return 0
        return int(next_token in self.RBRACKET)

    def f_has_quotation_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx - 1) >= 0:
            prev_token = self.tokens[idx - 1]
        else:
            return 0
        return int(prev_token in self.QUOTATIONS)

    def f_has_quotation_after(self, idx):
        if (idx + 1) < self.num_tokens:
            next_token = self.tokens[idx + 1]
        else:
            return 0
        return int(next_token in self.QUOTATIONS)

    def f_is_possible_volume(self, idx):
        # A digit-only token wrapped in brackets, e.g. "( 3 )"
        token = self.tokens[idx]
        if ((idx - 1) >= 0) and ((idx + 1) < self.num_tokens):
            prev_token = self.tokens[idx - 1]
            next_token = self.tokens[idx + 1]
            return int(prev_token in self.LBRACKET and
                       next_token in self.RBRACKET and token.isdigit())
        else:
            return 0

    # ???? necessary?
    def f_is_at_second_half_of_string(self, idx):
        return int(idx > self.num_tokens / 2)

    def f_has_delimiter_before(self, idx):
        token = self.tokens[idx]
        if len(token) == 0:
            return 0
        if (idx - 1) >= 0:
            prev_token = self.tokens[idx - 1]
        else:
            return 0
        return int(len(prev_token) == 1 and prev_token in self.DELIMITERS)

    def f_has_delimiter_after(self, idx):
        if (idx + 1) < self.num_tokens:
            next_token = self.tokens[idx + 1]
        else:
            return 0
        return int(len(next_token) == 1 and next_token in self.DELIMITERS)

    def f_is_an_and_between_two_names(self, idx):
        token = self.tokens[idx]
        if (idx + 1) < self.num_tokens and (idx - 1) >= 0:
            next_token = self.tokens[idx + 1]
            prev_token = self.tokens[idx - 1]
        else:
            return 0
        return int(token.strip().lower() == 'and' and
                   self.f_is_capitalized(idx - 1) and
                   (self.f_is_english(idx - 1) == 0))

    def f_is_followed_by_year(self, idx):
        token = self.tokens[idx]
        if (idx + 1) < self.num_tokens:
            next_token = self.tokens[idx + 1]
        else:
            return 0
        return int((len(next_token) == 2 or len(next_token) == 4) and
                   next_token.isdigit() and not token.isdigit())

    # Addressing possible new notions in publication titles,
    # e.g. "DustMiner :", "Content-aware", "Group-By"
    def f_is_possible_new_notion(self, idx):
        token = self.tokens[idx]
        if (idx + 2) < self.num_tokens:
            next_token = self.tokens[idx + 1]
            next_next_token = self.tokens[idx + 2]
        else:
            return 0
        p1 = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
        p2 = re.compile(r'^[A-Z][a-z0-9]+$', re.MULTILINE)
        p3 = re.compile(r'^[A-Z][a-z0-9]+[A-Z][a-z0-9]+[A-Z][a-z0-9]+$', re.MULTILINE)
        p4 = re.compile(r'^[a-z0-9]+$', re.MULTILINE)
        p5 = re.compile(r'[A-Z]*[A-Za-z]+-[A-Za-z]+')  # hyphenated terminology, e.g. content-aware; Group-By
        # Shapes covered: Xxxxxx, XxxxxXxxxx, XxxxXxxxXxxx, Xxxx xxxx, Xxxx Xxxx, XXXX
        pattern_1 = token.isupper() and next_token == ':'
        pattern_2 = (p1.match(token) is not None) and next_token == ':'
        pattern_3 = (p2.match(token) is not None) and next_token == ':'
        pattern_4 = (p3.match(token) is not None) and next_token == ':'
        pattern_5 = (p2.match(token) is not None) and (p2.match(next_token) is not None) and next_next_token == ':'
        pattern_6 = (p2.match(token) is not None) and (p4.match(next_token) is not None) and next_next_token == ':'
        pattern_7 = p5.match(token) is not None
        return int(pattern_1 or pattern_2 or pattern_3 or pattern_4 or
                   pattern_5 or pattern_6 or pattern_7)

    def f_is_possible_boundary(self, idx):
        # Check for a sentence-boundary period. Pending feature.
        token = self.tokens[idx]
        if (idx + 1) < self.num_tokens and (idx - 1) >= 0:
            next_token = self.tokens[idx + 1]
            prev_token = self.tokens[idx - 1]
        else:
            return 0
        return int((token == '.' and prev_token.islower() and next_token[0].isupper()) or
                   (token[-1] == '.' and token[0].islower() and next_token[0].isupper()))
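
# Usage sketch for FeatureGenerator (illustrative only: the record string
# below is invented, and build() needs the data/*.lst gazetteer files plus a
# reachable Tokens tokenizer service):
#
#     fg = FeatureGenerator()
#     vectors = fg.build('J. Han , Data Mining: Concepts and Techniques , 2011')
#     # vectors[i] holds one 0/1 value per feature function in the active
#     # pipeline, in pipeline order, for token i.
#     fg.print_features()
#     fg.close_connection()
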
def get_training_samples(url):
    log_err('\tGetting Training sample')
    raw_results = router(url)
    log_err('\tData retrieved. Preprocessing...')
    observation_list = []
    label_list = []
    records = []
    feature_generator = FeatureGenerator()
    token_generator = Tokens()

    # Label scheme:
    #   FN: 0    LN: 1    DL: 2    TI: 3    VN: 4    DT: 5
    #   (first name, last name, delimiter, title, venue, date)

    def author_segment(author, variant):
        """Render one author name under a formatting variant.

        Returns (text, labels) for the name's tokens. A single-token name
        cannot be reordered or abbreviated, and names of more than two tokens
        are left unchanged for now.
        """
        author_tokens = token_generator.tokenize(author)['tokens']
        if len(author_tokens) == 1:
            return author + ' , ', [1, 2]
        if len(author_tokens) != 2 or variant == 'plain':
            text = author + ' , '
            return text, [0] * (feature_generator.token_length(text) - 2) + [1, 2]
        first, last = author_tokens
        if variant == 'abbrev':                # insert dot: "A.B , "
            text = first + '.' + last + ' , '
            return text, [0] * (feature_generator.token_length(text) - 2) + [1, 2]
        if variant == 'last_only':             # only keep last name: "B , "
            return last + ' , ', [1, 2]
        if variant == 'last_first_dot_comma':  # change order, insert dot: "B A.,"
            return last + ' ' + first + '.,', [1, 0, 0, 2]
        # variant == 'last_first_dot'          # "B A. " -- no delimiter label
        return last + ' ' + first + '. ', [1, 0, 0]

    def add_sample(authors, title, venue, date, author_variant, title_first):
        """Assemble one synthetic record and its parallel label sequence, then
        append the record, its labels and its feature vectors to the outputs."""
        segments = [author_segment(author, author_variant)
                    for author in authors if len(author) > 0]
        title_segment = (title, [3] * (feature_generator.token_length(title) - 1) + [2])
        if title_first:
            segments.insert(0, title_segment)
        else:
            segments.append(title_segment)
        if len(venue) > 0:
            segments.append((venue, [4] * (feature_generator.token_length(venue) - 1) + [2]))
        if len(date) > 0:
            segments.append((date, [5] * feature_generator.token_length(date)))
        record = ''.join(text for text, _ in segments)
        labels = []
        for _, segment_labels in segments:
            labels += segment_labels
        label_list.append(labels)
        records.append(record)
        observation_list.append(feature_generator.build(record))

    for raw_result in raw_results:
        authors = raw_result['authors']
        title_copy = raw_result['title']
        # Prefer the journal name, falling back to the conference name.
        try:
            venue_copy = raw_result['journal name']
        except KeyError:
            try:
                venue_copy = raw_result['conference name']
            except KeyError:
                venue_copy = ''
        if len(venue_copy) > 0:
            try:
                volume = raw_result['volume']
            except KeyError:
                volume = ''
            try:
                issue = raw_result['issue']
            except KeyError:
                issue = ''
            try:
                page = raw_result['page']
            except KeyError:
                page = ''
            venue_copy += ' ' + volume + ' ' + issue + ' ' + page
        date = raw_result['publication date'][:4]

        # Generate one sample per combination of segment delimiter (comma or
        # period after title/venue), author-name variant, and field order
        # (Author -> Title -> ... or Title -> Author -> ...). The name
        # variations are very sensitive to the sample source: Google Scholar,
        # the current source, mostly formats names as 'JW Han', while real
        # citations vary widely.
        log_err('\tGenerating multiple cases for name variations... ')
        for delimiter in (' , ', ' . '):
            if delimiter == ' . ':
                log_err('\tGenerating multiple cases for period as DL... ')
            title = title_copy + delimiter
            venue = venue_copy + delimiter if len(venue_copy) > 0 else ''
            for author_variant in ('plain', 'abbrev', 'last_only',
                                   'last_first_dot_comma', 'last_first_dot'):
                for title_first in (False, True):
                    add_sample(authors, title, venue, date,
                               author_variant, title_first)

    # ============================= Verbose: print the training set
    label_names = {0: 'FN', 1: 'LN', 2: 'DL', 3: 'TI', 4: 'VN', 5: 'DT'}
    for record, observation, label in zip(records, observation_list, label_list):
        for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'],
                              observation, label):
            print oo, '\t', label_names[ll], '\t', rr.encode('utf-8')
        print '\n\n'
    return observation_list, label_list
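
# A minimal driver sketch (the query URL is a placeholder; router(), the
# tokenizer service and the data/*.lst files must all be available for this
# to actually run):
if __name__ == '__main__':
    X, y = get_training_samples('http://example.org/scholar?q=data+mining')
    # X[i] is the feature-vector sequence for synthetic record i, and y[i] is
    # the parallel label sequence over {FN:0, LN:1, DL:2, TI:3, VN:4, DT:5}.
    log_err('Generated ' + str(len(X)) + ' training records')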