def get_training_samples_raw_cora():
    """Parse the tagged CORA corpus file into per-record training samples.

    Each line of the file at CORA_RAW_FILE_PATH is one SGML/HTML-tagged
    publication record.  Every tag name is mapped to an integer label via
    LABEL_INT_MAP, and the tag's text is expanded into per-token feature
    vectors by FeatureGenerator, so observations and labels stay aligned
    one-to-one.

    Returns:
        (observation_list, label_list): parallel lists with one entry per
        record; each entry is the record's list of token feature vectors
        and the matching list of int labels.
    """
    feature_generator = FeatureGenerator()
    observation_list = []
    label_list = []
    # BUG FIX: the file handle was opened and never closed; a context
    # manager guarantees it is released even if parsing raises.
    with open(CORA_RAW_FILE_PATH, 'r') as fp:
        for data in fp:
            # One line == one tagged publication record.
            soup = BeautifulSoup(data)
            tmp_observation_list = []
            tmp_label_list = []
            for child in soup.body.children:
                # Skip NavigableString nodes and keep only real tags.
                if type(child) is not bs4.element.Tag:
                    continue
                label = LABEL_INT_MAP[child.name]  # int label for this field tag
                feature_vectors = feature_generator.build(child.text)
                feature_generator.print_features()  # debug trace of generated features
                tmp_observation_list += feature_vectors
                # One label per generated feature vector keeps the lists aligned.
                tmp_label_list += [label] * len(feature_vectors)
            observation_list.append(tmp_observation_list)
            label_list.append(tmp_label_list)
    feature_generator.close_connection()
    return observation_list, label_list
def get_training_samples(url): log_err('\tGetting Training sample') raw_results = router(url) log_err('\tData retrieved. Preprocessing...') observation_list = [] label_list = [] records = [] feature_generator = FeatureGenerator() token_generator = Tokens() for raw_result in raw_results: tmp_record = '' tmp_observation_list = [] tmp_label_list = [] authors = raw_result['authors'] title = raw_result['title'] title_copy = raw_result['title'] try: venue = raw_result['conference name'] venue_copy = raw_result['conference name'] except: venue = '' venue_copy = '' try: venue = raw_result['journal name'] venue_copy = raw_result['journal name'] except: venue = '' venue_copy = '' if len(venue) > 0: try: volume = raw_result['volume'] except: volume = '' try: issue = raw_result['issue'] except: issue = '' try: page = raw_result['page'] except: page = '' venue += ' ' + volume + ' ' + issue + ' ' + page venue_copy += ' ' + volume + ' ' + issue + ' ' + page date = raw_result['publication date'][:4] # FN: 0 # LN: 1 # DL: 2 # TI: 3 # VN: 4 # DT: 5 # Author -> Title -> ... # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) tmp_label_list += [1,2] # title title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) tmp_label_list += [2] # venue if len(venue) > 0: venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Title -> Author -> ... 
tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) #=================================Variations of authors================================= # Changing order, inserting dot, and probably insert comma as delimiter inside of names # This part of variations is very sensitive to what sample source to choose from, # for example, Google scholar is the current source of samples, and on gscholar, # most names are in format of JW Han. <-- Prior knowledge # Read more Learn more Change the Globe !!! log_err('\tGenerating multiple cases for name variations... ') # ================================A. B tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # authors for author in authors: if len(author) == 0: continue #???? BUG!!!! split() doesn't mean tokenization author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order tokens if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. 
B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order to if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B, # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A., # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A. # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. 
tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! 
# Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ # Period Case!!! log_err('\tGenerating multiple cases for period as DL... ') # Author -> Title -> ... # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title title = title_copy + ' . ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: venue = venue_copy + ' . 
' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================A. B tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order tokens if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. 
B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order to if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B, # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A., # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! 
tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A. # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. 
tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! 
# Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # =============================================================================Verbose: Print the training set for record, observation, label in zip(records, observation_list, label_list): for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'], observation, label): if ll == 0: ll = 'FN' elif ll == 1: ll = 'LN' elif ll == 2: ll = 'DL' elif ll == 3: ll = 'TI' elif ll == 4: ll = 'VN' elif ll == 5: ll = 'DT' print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8') print '\n\n' return observation_list, label_list