def sr_parse(self, texts):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(texts)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        label = self.predict(features)
        action = label2action(label)
        # The best choice here is to take the first legal action
        try:
            srparser.operate(action)
        except ActionError:
            print "Parsing action error with {}".format(action)
            sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
def sr_parse(self, doc, bcvocab=None):
    """ Shift-reduce RST parsing based on model prediction

    :type doc: Doc instance
    :param doc: document to be parsed, as a sequence of EDUs
    :type bcvocab: dict
    :param bcvocab: brown clusters
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(doc)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue, doc, bcvocab)
        feat = fg.features()
        labels = self.rank_labels(feat)
        # Try the ranked actions in order and take the first legal one
        for label in labels:
            action = label2action(label)
            try:
                srparser.operate(action)
                break
            except ActionError:
                pass
    tree = srparser.getparsetree()
    rst = RSTTree()
    rst.asign_tree(tree)
    return rst
def sr_parse(self, texts):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(texts)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        labels = self.predict(features)
        # Enumerate all possible actions, ranked by prediction score
        for i, label in enumerate(labels):
            action = label2action(label)
            try:
                srparser.operate(action)
                break  # legal action found, end the loop
            except ActionError:
                if i < len(labels) - 1:
                    # Not a legal action; try the next candidate.
                    # (The original checked i < len(labels), which is
                    # always true inside the loop, so the error branch
                    # below was unreachable.)
                    continue
                else:
                    print "Parsing action error with {}".format(action)
                    sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
def sr_parse(self, texts, d_pos, d_dep):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    :param d_pos: POS-tag dictionary, passed through to the parser
    :param d_dep: dependency dictionary, passed through to the parser
    """
    # Initialize parser
    srparser = SRParser([], [])
    srparser.init(texts, d_pos, d_dep)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        label = self.predict(features)
        action = label2action(label)
        # The best choice here is to take the first legal action
        try:
            srparser.operate(action)
        except ActionError:
            print "Parsing action error with {}".format(action)
            sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
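A minimal usage sketch for the sr_parse variants above. The parser object
and the EDU strings are illustrative assumptions; only the calls that appear
in the methods themselves are taken from the source:

    edus = ["The dollar fell sharply ,",      # pre-segmented EDUs
            "despite posting early gains ."]
    rst = model.sr_parse(edus)                # model: any instance providing predict()
    # rst is an RSTTree wrapping the tree built by the shift-reduce parser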
def __init__(self, thresh=20, topn=100):
    """ Initialization """
    self.vocab = {}
    self.features = defaultdict(float)
    self.thresh = thresh
    self.topn = topn
    self.fg = FeatureGenerator()
def __init__(self, vocab):
    """ Initialization """
    self.vocab = vocab
    self.fg = FeatureGenerator()
    self.featdict = {}
    self.labels = []
def evaluate(self, raw_segment, log=True, normalize=False):
    # 1. Translate the token list into feature vectors
    observation_sequence = FeatureGenerator(raw_segment).features
    # 2. Encode the feature vectors into integers
    encoded_sequence = []
    for vector in observation_sequence:
        key = str(vector)
        encoded_symbol = self.feature_symbol_mapping[key]
        encoded_sequence.append(encoded_symbol)
    print observation_sequence  # debug output
    # 3. Call the forward algorithm
    likelihood, log_likelihood = self.forward(encoded_sequence)
    # 4. Normalize the score by length
    #    (FIXME: scaling up by the length does not actually normalize)
    if normalize:
        likelihood = float(len(observation_sequence)) * likelihood
        log_likelihood = math.log(float(
            len(observation_sequence))) + log_likelihood
    if log:
        return log_likelihood
    return likelihood
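The normalize branch above scales the likelihood up by the sequence length,
which, as the FIXME notes, is not a real length normalization. A common
alternative (an assumption on my part, not taken from this code) is the
average log-likelihood per token, i.e. the geometric mean in probability
space:

    # per-token average log-likelihood (hypothetical replacement)
    norm_log_likelihood = log_likelihood / float(len(observation_sequence))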
def get_training_samples_raw_cora():
    fp = open(CORA_RAW_FILE_PATH, 'r')
    feature_generator = FeatureGenerator()
    observation_list = []
    label_list = []
    for data in fp:
        # Parse one piece of training data (a single tagged publication)
        soup = BeautifulSoup(data)
        tmp_observation_list = []
        tmp_label_list = []
        for child in soup.body.children:
            if type(child) is bs4.element.Tag:
                raw_label = child.name
                raw_text = child.text
                label = LABEL_INT_MAP[raw_label]  # int label
                feature_vectors = feature_generator.build(raw_text)
                feature_generator.print_features()
                tmp_observation_list += feature_vectors
                tmp_label_list += [label] * len(feature_vectors)
            else:
                continue
        observation_list.append(tmp_observation_list)
        label_list.append(tmp_label_list)
    feature_generator.close_connection()
    return observation_list, label_list
class VocabGenerator(object):
    def __init__(self, thresh=20, topn=100):
        """ Initialization """
        self.vocab = {}
        self.features = defaultdict(float)
        self.thresh = thresh
        self.topn = topn
        self.fg = FeatureGenerator()

    def build(self, doc):
        """ Extract features from a given doc and accumulate their counts

        :type doc: Doc instance
        :param doc: document whose features are added to the counts
        """
        featdict = self.fg.extract(doc)
        for (idx, featlist) in featdict.iteritems():
            for feat in featlist:
                self.features[feat] += 1.0

    def select(self):
        """ Select top-n features according to frequency """
        pass

    def filter(self):
        """ Filter out low-frequency features with thresh """
        index = 0
        for (feat, freq) in self.features.iteritems():
            if freq >= self.thresh:
                self.vocab[feat] = index
                index += 1

    def getvocab(self):
        """ Return vocab """
        if len(self.vocab) == 0:
            raise ValueError("Empty vocab")
        return self.vocab

    def savevocab(self, fvocab):
        """ Dump vocab into a pickle file """
        # The original appended 'pickle.gz' without the leading dot,
        # producing names like "vocabpickle.gz"
        if not fvocab.endswith('.pickle.gz'):
            fvocab += '.pickle.gz'
        fout = gzip.open(fvocab, 'w')
        if len(self.vocab) == 0:
            raise ValueError("Empty vocab")
        dump(self.vocab, fout)
        print "Save vocab into file: {}".format(fvocab)
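A usage sketch for VocabGenerator; docs is assumed to be an iterable of Doc
instances, matching what build() expects:

    vg = VocabGenerator(thresh=20, topn=100)
    for doc in docs:
        vg.build(doc)                    # accumulate feature frequencies
    vg.filter()                          # keep features with freq >= thresh
    vocab = vg.getvocab()                # {feature: column index}
    vg.savevocab('segmenter_vocab')      # saved as segmenter_vocab.pickle.gz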
def generate_samples(self):
    """ Generate samples from a binary RST tree """
    # Sample list
    samplelist = []
    # Parsing actions
    actionlist = decodeSRaction(self.tree)
    # Initialize queue and stack
    queue = getedunode(self.tree)
    stack = []
    # Start simulating the shift-reduce parsing
    for action in actionlist:
        # Generate features
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        samplelist.append(features)
        # Change the status of the stack/queue
        sr = SRParser(stack, queue)
        sr.operate(action)
        # stack, queue = sr.getstatus()
    return (actionlist, samplelist)
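The two returned lists are aligned step by step, so each feature vector can
be paired with the gold shift-reduce action taken from that parser state.
A sketch (the surrounding generator object is an assumption):

    actionlist, samplelist = generator.generate_samples()
    training_pairs = zip(samplelist, actionlist)   # (features, gold action)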
class SampleGenerator(object):
    def __init__(self, vocab):
        """ Initialization """
        self.vocab = vocab
        self.fg = FeatureGenerator()
        self.featdict = {}
        self.labels = []

    def build(self, doc):
        """ Build training examples from ONE doc """
        N = len(self.featdict)
        index = 0
        featdct = self.fg.extract(doc)
        for (gidx, featlist) in featdct.iteritems():
            self.featdict[N + index] = featlist
            # Only tokens with a boundary indicator yield a label
            if doc.tokendict[gidx].boundary is not None:
                if doc.tokendict[gidx].boundary:
                    self.labels.append(1)
                else:
                    self.labels.append(0)
            index += 1

    def getmat(self):
        """ Vectorize all elements in featdict """
        nRow = len(self.featdict)
        nCol = len(self.vocab)
        Datadict = defaultdict(float)
        Ridx, Cidx, Val = [], [], []
        for ridx in range(nRow):
            for feat in self.featdict[ridx]:
                try:
                    cidx = self.vocab[feat]
                    Datadict[(ridx, cidx)] += 1.0
                except KeyError:
                    # Skip out-of-vocabulary features
                    pass
        # Convert to COO format
        for (key, val) in Datadict.iteritems():
            Ridx.append(key[0])
            Cidx.append(key[1])
            Val.append(val)
        M = coo_matrix((Val, (Ridx, Cidx)), shape=(nRow, nCol))
        return (M, self.labels)
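A usage sketch for SampleGenerator. The classifier line is illustrative and
not part of the source (scikit-learn accepts the SciPy sparse matrix
directly and converts it internally):

    sg = SampleGenerator(vocab)
    for doc in docs:                 # docs: iterable of Doc instances (assumed)
        sg.build(doc)
    M, labels = sg.getmat()          # sparse COO matrix and parallel label list

    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression().fit(M, labels)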
def decode_without_constraints(self, raw_segment):
    # 1. Translate the token list into feature vectors
    fg = FeatureGenerator(raw_segment)
    observation_sequence = fg.features
    observation_tokens = fg.tokens
    # 2. Encode the feature vectors into integers
    encoded_sequence = []
    for vector in observation_sequence:
        key = str(vector)
        encoded_symbol = self.feature_symbol_mapping[key]
        encoded_sequence.append(encoded_symbol)
    # 3. Call the Viterbi algorithm
    decoded_label_sequence = self.viterbi(encoded_sequence)
    return observation_sequence, decoded_label_sequence
def run(self):
    i = 0
    self.new_labels = []
    for raw_segment, label_sequence in zip(self.raw_segments,
                                           self.label_sequences):
        new_labels = self.hmm_new.decode(raw_segment)[1]
        self.new_labels.append(new_labels)
        tokens = Tokens(raw_segment).tokens
        feature_vectors = FeatureGenerator(raw_segment).features
        print i, ': ', raw_segment
        for token, old_label, new_label, feature_vector in zip(
                tokens, label_sequence, new_labels, feature_vectors):
            print to_label(old_label), '\t', to_label(new_label), '\t', token
            # Record the entity under its old label (old label first)
            self.feature_entity_list.add_entity(
                feature_vector, old_label, token)
        print '\n'
        i += 1
def evaluate_exhaustive(self, raw_segment):
    # 1. Translate the token list into feature vectors
    observation_sequence = FeatureGenerator(raw_segment).features
    # 2. Encode the feature vectors into integers
    encoded_sequence = []
    for vector in observation_sequence:
        key = str(vector)
        encoded_symbol = self.feature_symbol_mapping[key]
        encoded_sequence.append(encoded_symbol)
    print observation_sequence  # debug output
    # 3. Sum the likelihood over the space of all possible hidden
    #    state sequences
    sequence_length = len(observation_sequence)
    likelihood = 0.0
    Y = get_binary_vector(self.observation_dim ** sequence_length,
                          sequence_length)
    for y in Y:
        likelihood += self.evaluate_helper(observation_sequence, y)
    return likelihood
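Because this sums over all observation_dim ** sequence_length hidden
sequences, it is only tractable for very short segments; its main use is as
a sanity check against the forward algorithm. A sketch, using the evaluate
method defined above on the same (assumed) hmm instance:

    brute = hmm.evaluate_exhaustive(raw_segment)
    fast = hmm.evaluate(raw_segment, log=False)
    assert abs(brute - fast) < 1e-9   # should agree up to rounding error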
def sr_parse(self, texts, fname):
    """ Shift-reduce RST parsing based on model prediction

    :type texts: list of string
    :param texts: list of EDUs for parsing
    :type fname: string
    :param fname: name of the EDU file, used to locate the companion
        .dep, .pos and .line files
    """
    # Initialize parser
    srparser = SRParser([], [])
    dep = defaultdict()
    pos = defaultdict()
    lines = defaultdict()
    if fname.endswith(".out.edus"):
        prefix = fname.split(".out.edus")[0]
        # Load dependency information
        f = open(prefix + '.dep', "r")
        for line in f.read().splitlines():
            l = line.split('@#%^&*')
            dep[l[0]] = l[1]
        # Load POS tags
        f = open(prefix + '.pos', "r")
        for line in f.read().splitlines():
            l = line.split('@#%^&*')
            pos[l[0]] = l[1].strip()
        # Load line information
        f = open(prefix + '.line', "r")
        for line in f.read().splitlines():
            l = line.split('@#%^&*')
            lines[l[0]] = l[1]
    srparser.init(texts, pos, dep, lines)
    # Parsing
    while not srparser.endparsing():
        # Generate features
        stack, queue = srparser.getstatus()
        # Make sure to call the generator with the same
        # arguments as in the data-generation part
        fg = FeatureGenerator(stack, queue)
        features = fg.features()
        labels = self.predict(features)
        # Enumerate all possible actions, ranked by prediction score
        for i, label in enumerate(labels):
            action = label2action(label)
            try:
                srparser.operate(action)
                break  # legal action found, end the loop
            except ActionError:
                if i < len(labels) - 1:
                    # Not a legal action; try the next candidate
                    continue
                else:
                    print "Parsing action error with {}".format(action)
                    sys.exit()
    tree = srparser.getparsetree()
    rst = RSTTree(tree=tree)
    return rst
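The companion .dep/.pos/.line files read above are keyed line by line with
the literal delimiter '@#%^&*' (taken from the split calls in the code); the
key and value contents shown here are illustrative assumptions only:

    <key>@#%^&*<value>
    # e.g. a line in a .pos file might look like:  12@#%^&*NN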
def get_training_samples(url):
    log_err('\tGetting training samples')
    raw_results = router(url)
    log_err('\tData retrieved. Preprocessing...')
    observation_list = []
    label_list = []
    records = []
    feature_generator = FeatureGenerator()
    token_generator = Tokens()

    # Label coding: FN: 0, LN: 1, DL: 2, TI: 3, VN: 4, DT: 5

    def author_variant(author, style):
        """Render one author name under the given variation style and
        return (string, labels).

        The variations reflect the sample source: Google Scholar is the
        current source, and there most names look like "JW Han".
        """
        # Use the tokenizer here: plain split() is not real tokenization
        # (this was flagged as a bug in the original code)
        tokens = token_generator.tokenize(author)['tokens']
        if style == 'full' or len(tokens) > 2:
            # Keep the name as-is; names with more than two tokens are
            # always left alone
            s = author + ' , '
            return s, [0] * (feature_generator.token_length(s) - 2) + [1, 2]
        if len(tokens) == 1:
            # A single token cannot be reordered; treat it as a last name
            return author + ' , ', [1, 2]
        if style == 'A. B':
            # Insert a dot after the first name
            s = tokens[0] + '.' + tokens[1] + ' , '
            return s, [0] * (token_generator.token_length(s) - 2) + [1, 2]
        if style == 'B':
            # Keep only the last name
            return tokens[1] + ' , ', [1, 2]
        if style == 'B A.,':
            # Change the order and insert a dot
            return tokens[1] + ' ' + tokens[0] + '.,', [1, 0, 0, 2]
        if style == 'B A.':
            # Change the order and insert a dot, with no trailing comma
            return tokens[1] + ' ' + tokens[0] + '. ', [1, 0, 0]

    def emit(authors, title, venue, date, style, title_first):
        """Assemble one record with its label sequence and append both to
        the aggregated output lists."""
        parts, labels = [], []

        def add_authors():
            for author in authors:
                if len(author) == 0:
                    continue
                s, ls = author_variant(author, style)
                parts.append(s)
                labels.extend(ls)

        def add_title():
            parts.append(title)
            labels.extend([3] * (feature_generator.token_length(title) - 1))
            labels.append(2)

        if title_first:
            add_title()
            add_authors()
        else:
            add_authors()
            add_title()
        # venue
        if len(venue) > 0:
            parts.append(venue)
            labels.extend([4] * (feature_generator.token_length(venue) - 1))
            labels.append(2)
        # date
        if len(date) > 0:
            parts.append(date)
            labels.extend([5] * feature_generator.token_length(date))
        # Aggregate and append
        record = ''.join(parts)
        label_list.append(labels)
        records.append(record)
        observation_list.append(feature_generator.build(record))

    styles = ['full', 'A. B', 'B', 'B A.,', 'B A.']
    for raw_result in raw_results:
        authors = raw_result['authors']
        title = raw_result['title']
        # Prefer the journal name; fall back to the conference name.
        # (The original reset venue to '' whenever 'journal name' was
        # missing, silently dropping the conference name -- fixed here.)
        venue = raw_result.get('journal name',
                               raw_result.get('conference name', ''))
        if len(venue) > 0:
            volume = raw_result.get('volume', '')
            issue = raw_result.get('issue', '')
            page = raw_result.get('page', '')
            venue += ' ' + volume + ' ' + issue + ' ' + page
        date = raw_result['publication date'][:4]

        # Comma as the delimiter after title and venue; each name-variation
        # style yields an Author->Title record and a Title->Author record
        log_err('\tGenerating multiple cases for name variations... ')
        title_c = title + ' , '
        venue_c = venue + ' , ' if len(venue) > 0 else ''
        for style in styles:
            emit(authors, title_c, venue_c, date, style, title_first=False)
            emit(authors, title_c, venue_c, date, style, title_first=True)

        # Period as the delimiter after title and venue
        log_err('\tGenerating multiple cases for period as DL... ')
        title_p = title + ' . '
        venue_p = venue + ' . ' if len(venue) > 0 else ''
        for style in styles:
            emit(authors, title_p, venue_p, date, style, title_first=False)
            emit(authors, title_p, venue_p, date, style, title_first=True)

    # Verbose: print the training set
    for record, observation, label in zip(records, observation_list,
                                          label_list):
        for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'],
                              observation, label):
            if ll == 0:
                ll = 'FN'
            elif ll == 1:
                ll = 'LN'
            elif ll == 2:
                ll = 'DL'
            elif ll == 3:
                ll = 'TI'
            elif ll == 4:
                ll = 'VN'
            elif ll == 5:
                ll = 'DT'
            print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8')
        print '\n\n'
    return observation_list, label_list