def sr_parse(self, texts):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(texts)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        label = self.predict(features)
        action = label2action(label)
        # The best choice here is to take the first legal action
        try:
            srparser.operate(action)
        except ActionError:
            print "Parsing action error with {}".format(action)
            sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
def sr_parse(self, doc, bcvocab=None):
    """ Shift-reduce RST parsing based on model prediction

    :type doc: Doc instance
    :param doc: document to be parsed, as a sequence of EDUs
    :type bcvocab: dict
    :param bcvocab: brown clusters
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(doc)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue, doc, bcvocab)
        feat = fg.features()
        labels = self.rank_labels(feat)
        # Try the ranked actions in order and take the first legal one
        for label in labels:
            action = label2action(label)
            try:
                srparser.operate(action)
                break
            except ActionError:
                pass
    tree = srparser.getparsetree()
    rst = RSTTree()
    rst.asign_tree(tree)
    return rst
def sr_parse(self, texts):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(texts)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        labels = self.predict(features)
        # Enumerate all possible actions, ranked by prediction score
        for i, label in enumerate(labels):
            action = label2action(label)
            try:
                srparser.operate(action)
                break  # legal action found, end the loop
            except ActionError:
                if i < len(labels) - 1:
                    # Not a legal action; try the next candidate.
                    # (The original checked i < len(labels), which is
                    # always true inside the loop, so the error branch
                    # below was unreachable.)
                    continue
                else:
                    print "Parsing action error with {}".format(action)
                    sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
def sr_parse(self, texts, d_pos, d_dep):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    :param d_pos: POS-tag dictionary, passed through to the parser
    :param d_dep: dependency dictionary, passed through to the parser
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(texts, d_pos, d_dep)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        label = self.predict(features)
        action = label2action(label)
        # The best choice here is to take the first legal action
        try:
            srparser.operate(action)
        except ActionError:
            print "Parsing action error with {}".format(action)
            sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
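A minimal usage sketch for the sr_parse variants above. The parser object
and the EDU strings are illustrative assumptions; only the calls that appear
in the methods themselves are taken from the source:

    edus = ["The dollar fell sharply ,",      # pre-segmented EDUs
            "despite posting early gains ."]
    rst = model.sr_parse(edus)                # model: any instance providing predict()
    # rst is an RSTTree wrapping the tree built by the shift-reduce parser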
def __init__(self, thresh=20, topn=100):
    """ Initialization """
    self.vocab = {}
    self.features = defaultdict(float)
    self.thresh = thresh
    self.topn = topn
    self.fg = FeatureGenerator()
def __init__(self, vocab):
    """ Initialization """
    self.vocab = vocab
    self.fg = FeatureGenerator()
    self.featdict = {}
    self.labels = []
def evaluate(self, raw_segment, log=True, normalize=False):
    # 1. Translate the token list into feature vectors
    observation_sequence = FeatureGenerator(raw_segment).features
    # 2. Encode the feature vectors into integers
    encoded_sequence = []
    for vector in observation_sequence:
        key = str(vector)
        encoded_symbol = self.feature_symbol_mapping[key]
        encoded_sequence.append(encoded_symbol)
    print observation_sequence  # debug output
    # 3. Call the forward algorithm
    likelihood, log_likelihood = self.forward(encoded_sequence)
    # 4. Normalize the score by length
    #    (FIXME: scaling up by the length does not actually normalize)
    if normalize:
        likelihood = float(len(observation_sequence)) * likelihood
        log_likelihood = math.log(float(
            len(observation_sequence))) + log_likelihood
    if log:
        return log_likelihood
    return likelihood
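The normalize branch above scales the likelihood up by the sequence length,
which, as the FIXME notes, is not a real length normalization. A common
alternative (an assumption on my part, not taken from this code) is the
average log-likelihood per token, i.e. the geometric mean in probability
space:

    # per-token average log-likelihood (hypothetical replacement)
    norm_log_likelihood = log_likelihood / float(len(observation_sequence))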
def get_training_samples_raw_cora():
    fp = open(CORA_RAW_FILE_PATH, 'r')
    feature_generator = FeatureGenerator()
    observation_list = []
    label_list = []
    for data in fp:
        # Parse one piece of training data (a single tagged publication)
        soup = BeautifulSoup(data)
        tmp_observation_list = []
        tmp_label_list = []
        for child in soup.body.children:
            if type(child) is bs4.element.Tag:
                raw_label = child.name
                raw_text = child.text
                label = LABEL_INT_MAP[raw_label]  # int label
                feature_vectors = feature_generator.build(raw_text)
                feature_generator.print_features()
                tmp_observation_list += feature_vectors
                tmp_label_list += [label] * len(feature_vectors)
            else:
                continue
        observation_list.append(tmp_observation_list)
        label_list.append(tmp_label_list)
    feature_generator.close_connection()
    return observation_list, label_list
class VocabGenerator(object):
    def __init__(self, thresh=20, topn=100):
        """ Initialization """
        self.vocab = {}
        self.features = defaultdict(float)
        self.thresh = thresh
        self.topn = topn
        self.fg = FeatureGenerator()

    def build(self, doc):
        """ Extract features from a given doc and accumulate their counts

        :type doc: Doc instance
        :param doc: document whose features are added to the counts
        """
        featdict = self.fg.extract(doc)
        for (idx, featlist) in featdict.iteritems():
            for feat in featlist:
                self.features[feat] += 1.0

    def select(self):
        """ Select top-n features according to frequency """
        pass

    def filter(self):
        """ Filter out low-frequency features with thresh """
        index = 0
        for (feat, freq) in self.features.iteritems():
            if freq >= self.thresh:
                self.vocab[feat] = index
                index += 1

    def getvocab(self):
        """ Return vocab """
        if len(self.vocab) == 0:
            raise ValueError("Empty vocab")
        return self.vocab

    def savevocab(self, fvocab):
        """ Dump vocab into a pickle file """
        # The original appended 'pickle.gz' without the leading dot,
        # producing names like "vocabpickle.gz"
        if not fvocab.endswith('.pickle.gz'):
            fvocab += '.pickle.gz'
        fout = gzip.open(fvocab, 'w')
        if len(self.vocab) == 0:
            raise ValueError("Empty vocab")
        dump(self.vocab, fout)
        print "Save vocab into file: {}".format(fvocab)
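A usage sketch for VocabGenerator; docs is assumed to be an iterable of Doc
instances, matching what build() expects:

    vg = VocabGenerator(thresh=20, topn=100)
    for doc in docs:
        vg.build(doc)                    # accumulate feature frequencies
    vg.filter()                          # keep features with freq >= thresh
    vocab = vg.getvocab()                # {feature: column index}
    vg.savevocab('segmenter_vocab')      # saved as segmenter_vocab.pickle.gz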
def generate_samples(self):
    """ Generate samples from a binary RST tree """
    # Sample list
    samplelist = []
    # Parsing actions
    actionlist = decodeSRaction(self.tree)
    # Initialize queue and stack
    queue = getedunode(self.tree)
    stack = []
    # Start simulating the shift-reduce parsing
    for action in actionlist:
        # Generate features
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        samplelist.append(features)
        # Change the status of the stack/queue
        sr = SRParser(stack, queue)
        sr.operate(action)
        # stack, queue = sr.getstatus()
    return (actionlist, samplelist)
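The two returned lists are aligned step by step, so each feature vector can
be paired with the gold shift-reduce action taken from that parser state.
A sketch (the surrounding generator object is an assumption):

    actionlist, samplelist = generator.generate_samples()
    training_pairs = zip(samplelist, actionlist)   # (features, gold action)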
class SampleGenerator(object):
    def __init__(self, vocab):
        """ Initialization """
        self.vocab = vocab
        self.fg = FeatureGenerator()
        self.featdict = {}
        self.labels = []

    def build(self, doc):
        """ Build training examples from ONE doc """
        N = len(self.featdict)
        index = 0
        featdct = self.fg.extract(doc)
        for (gidx, featlist) in featdct.iteritems():
            self.featdict[N + index] = featlist
            # Only tokens with a boundary indicator yield a label
            if doc.tokendict[gidx].boundary is not None:
                if doc.tokendict[gidx].boundary:
                    self.labels.append(1)
                else:
                    self.labels.append(0)
            index += 1

    def getmat(self):
        """ Vectorize all elements in featdict """
        nRow = len(self.featdict)
        nCol = len(self.vocab)
        Datadict = defaultdict(float)
        Ridx, Cidx, Val = [], [], []
        for ridx in range(nRow):
            for feat in self.featdict[ridx]:
                try:
                    cidx = self.vocab[feat]
                    Datadict[(ridx, cidx)] += 1.0
                except KeyError:
                    # Skip out-of-vocabulary features
                    pass
        # Convert to COO format
        for (key, val) in Datadict.iteritems():
            Ridx.append(key[0])
            Cidx.append(key[1])
            Val.append(val)
        M = coo_matrix((Val, (Ridx, Cidx)), shape=(nRow, nCol))
        return (M, self.labels)
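A usage sketch for SampleGenerator. The classifier line is illustrative and
not part of the source (scikit-learn accepts the SciPy sparse matrix
directly and converts it internally):

    sg = SampleGenerator(vocab)
    for doc in docs:                 # docs: iterable of Doc instances (assumed)
        sg.build(doc)
    M, labels = sg.getmat()          # sparse COO matrix and parallel label list

    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression().fit(M, labels)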
def decode_without_constraints(self, raw_segment):
    # 1. Translate the token list into feature vectors
    fg = FeatureGenerator(raw_segment)
    observation_sequence = fg.features
    observation_tokens = fg.tokens
    # 2. Encode the feature vectors into integers
    encoded_sequence = []
    for vector in observation_sequence:
        key = str(vector)
        encoded_symbol = self.feature_symbol_mapping[key]
        encoded_sequence.append(encoded_symbol)
    # 3. Call the Viterbi algorithm
    decoded_label_sequence = self.viterbi(encoded_sequence)
    return observation_sequence, decoded_label_sequence
def run(self):
    i = 0
    self.new_labels = []
    for raw_segment, label_sequence in zip(self.raw_segments,
                                           self.label_sequences):
        new_labels = self.hmm_new.decode(raw_segment)[1]
        self.new_labels.append(new_labels)
        tokens = Tokens(raw_segment).tokens
        feature_vectors = FeatureGenerator(raw_segment).features
        print i, ': ', raw_segment
        for token, old_label, new_label, feature_vector in zip(
                tokens, label_sequence, new_labels, feature_vectors):
            print to_label(old_label), '\t', to_label(new_label), '\t', token
            # Record the entity under its old label (old label first)
            self.feature_entity_list.add_entity(
                feature_vector, old_label, token)
        print '\n'
        i += 1
def evaluate_exhaustive(self, raw_segment):
    # 1. Translate the token list into feature vectors
    observation_sequence = FeatureGenerator(raw_segment).features
    # 2. Encode the feature vectors into integers
    encoded_sequence = []
    for vector in observation_sequence:
        key = str(vector)
        encoded_symbol = self.feature_symbol_mapping[key]
        encoded_sequence.append(encoded_symbol)
    print observation_sequence  # debug output
    # 3. Sum the likelihood over the space of all possible hidden
    #    state sequences
    sequence_length = len(observation_sequence)
    likelihood = 0.0
    Y = get_binary_vector(self.observation_dim ** sequence_length,
                          sequence_length)
    for y in Y:
        likelihood += self.evaluate_helper(observation_sequence, y)
    return likelihood
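Because this sums over all observation_dim ** sequence_length hidden
sequences, it is only tractable for very short segments; its main use is as
a sanity check against the forward algorithm. A sketch, using the evaluate
method defined above on the same (assumed) hmm instance:

    brute = hmm.evaluate_exhaustive(raw_segment)
    fast = hmm.evaluate(raw_segment, log=False)
    assert abs(brute - fast) < 1e-9   # should agree up to rounding error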
def sr_parse(self, texts, fname):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    :type fname: string
    :param fname: name of the EDU file, used to locate the companion
        .dep, .pos and .line files
    """
    # Initialize parser
    srparser = SRParser([], [])
    dep = defaultdict()
    pos = defaultdict()
    lines = defaultdict()
    if fname.endswith(".out.edus"):
        prefix = fname.split(".out.edus")[0]
        # Load dependency information
        f = open(prefix + '.dep', "r")
        for line in f.read().splitlines():
            l = line.split('@#%^&*')
            dep[l[0]] = l[1]
        # Load POS tags
        f = open(prefix + '.pos', "r")
        for line in f.read().splitlines():
            l = line.split('@#%^&*')
            pos[l[0]] = l[1].strip()
        # Load line information
        f = open(prefix + '.line', "r")
        for line in f.read().splitlines():
            l = line.split('@#%^&*')
            lines[l[0]] = l[1]
    srparser.init(texts, pos, dep, lines)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        labels = self.predict(features)
        # Enumerate all possible actions, ranked by prediction score
        for i, label in enumerate(labels):
            action = label2action(label)
            try:
                srparser.operate(action)
                break  # legal action found, end the loop
            except ActionError:
                if i < len(labels) - 1:
                    # Not a legal action; try the next candidate
                    continue
                else:
                    print "Parsing action error with {}".format(action)
                    sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
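The companion .dep/.pos/.line files read above are keyed line by line with
the literal delimiter '@#%^&*' (taken from the split calls in the code); the
key and value contents shown here are illustrative assumptions only:

    <key>@#%^&*<value>
    # e.g. a line in a .pos file might look like:  12@#%^&*NN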
def get_training_samples(url):
    log_err('\tGetting training samples')
    raw_results = router(url)
    log_err('\tData retrieved. Preprocessing...')
    observation_list = []
    label_list = []
    records = []
    feature_generator = FeatureGenerator()
    token_generator = Tokens()

    # Label coding: FN: 0, LN: 1, DL: 2, TI: 3, VN: 4, DT: 5

    def author_variant(author, style):
        """Render one author name under the given variation style and
        return (string, labels).

        The variations reflect the sample source: Google Scholar is the
        current source, and there most names look like "JW Han".
        """
        # Use the tokenizer here: plain split() is not real tokenization
        # (this was flagged as a bug in the original code)
        tokens = token_generator.tokenize(author)['tokens']
        if style == 'full' or len(tokens) > 2:
            # Keep the name as-is; names with more than two tokens are
            # always left alone
            s = author + ' , '
            return s, [0] * (feature_generator.token_length(s) - 2) + [1, 2]
        if len(tokens) == 1:
            # A single token cannot be reordered; treat it as a last name
            return author + ' , ', [1, 2]
        if style == 'A. B':
            # Insert a dot after the first name
            s = tokens[0] + '.' + tokens[1] + ' , '
            return s, [0] * (token_generator.token_length(s) - 2) + [1, 2]
        if style == 'B':
            # Keep only the last name
            return tokens[1] + ' , ', [1, 2]
        if style == 'B A.,':
            # Change the order and insert a dot
            return tokens[1] + ' ' + tokens[0] + '.,', [1, 0, 0, 2]
        if style == 'B A.':
            # Change the order and insert a dot, with no trailing comma
            return tokens[1] + ' ' + tokens[0] + '. ', [1, 0, 0]

    def emit(authors, title, venue, date, style, title_first):
        """Assemble one record with its label sequence and append both to
        the aggregated output lists."""
        parts, labels = [], []

        def add_authors():
            for author in authors:
                if len(author) == 0:
                    continue
                s, ls = author_variant(author, style)
                parts.append(s)
                labels.extend(ls)

        def add_title():
            parts.append(title)
            labels.extend([3] * (feature_generator.token_length(title) - 1))
            labels.append(2)

        if title_first:
            add_title()
            add_authors()
        else:
            add_authors()
            add_title()
        # venue
        if len(venue) > 0:
            parts.append(venue)
            labels.extend([4] * (feature_generator.token_length(venue) - 1))
            labels.append(2)
        # date
        if len(date) > 0:
            parts.append(date)
            labels.extend([5] * feature_generator.token_length(date))
        # Aggregate and append
        record = ''.join(parts)
        label_list.append(labels)
        records.append(record)
        observation_list.append(feature_generator.build(record))

    styles = ['full', 'A. B', 'B', 'B A.,', 'B A.']
    for raw_result in raw_results:
        authors = raw_result['authors']
        title = raw_result['title']
        # Prefer the journal name; fall back to the conference name.
        # (The original reset venue to '' whenever 'journal name' was
        # missing, silently dropping the conference name -- fixed here.)
        venue = raw_result.get('journal name',
                               raw_result.get('conference name', ''))
        if len(venue) > 0:
            volume = raw_result.get('volume', '')
            issue = raw_result.get('issue', '')
            page = raw_result.get('page', '')
            venue += ' ' + volume + ' ' + issue + ' ' + page
        date = raw_result['publication date'][:4]

        # Comma as the delimiter after title and venue; each name-variation
        # style yields an Author->Title record and a Title->Author record
        log_err('\tGenerating multiple cases for name variations... ')
        title_c = title + ' , '
        venue_c = venue + ' , ' if len(venue) > 0 else ''
        for style in styles:
            emit(authors, title_c, venue_c, date, style, title_first=False)
            emit(authors, title_c, venue_c, date, style, title_first=True)

        # Period as the delimiter after title and venue
        log_err('\tGenerating multiple cases for period as DL... ')
        title_p = title + ' . '
        venue_p = venue + ' . ' if len(venue) > 0 else ''
        for style in styles:
            emit(authors, title_p, venue_p, date, style, title_first=False)
            emit(authors, title_p, venue_p, date, style, title_first=True)

    # Verbose: print the training set
    for record, observation, label in zip(records, observation_list,
                                          label_list):
        for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'],
                              observation, label):
            if ll == 0:
                ll = 'FN'
            elif ll == 1:
                ll = 'LN'
            elif ll == 2:
                ll = 'DL'
            elif ll == 3:
                ll = 'TI'
            elif ll == 4:
                ll = 'VN'
            elif ll == 5:
                ll = 'DT'
            print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8')
        print '\n\n'
    return observation_list, label_list