def send(self, data):
    if self.socket_to_server is None:
        utils.log_err("Tried to send data to server, but connection is closed...")
        return
    if isinstance(data, str):
        json_data = data
    else:
        try:
            json_data = json.dumps(data, indent=4, separators=(',', ': '))
        except TypeError:
            json_data = json.dumps(data.__dict__, indent=4, separators=(',', ': '))
    sent = 0
    try:
        sent = self.socket_to_server.send(json_data.encode('utf-8'))
    except socket.error:
        utils.log_err("Failed to send data!")
    if sent == 0:
        utils.log_wrn("Unable to send data to server; client connection to '" +
                      self.get_key() + "' broken! Auto-cleaning connection...")
        self.close()
    else:
        utils.log("Sent data '" + json_data + "' to '" + self.get_key() + "'")
def unsubscribe(event_name, listener):
    try:
        event_listeners[event_name].remove(listener)
    except KeyError:
        log_err("Trying to unsubscribe from non-existent event: " + event_name)
    except ValueError:
        log_err("Trying to unsubscribe a listener that is not subscribed to event: " + event_name)
def terminate():
    global running_plugins
    for plugin in list(running_plugins):
        running_plugins.discard(plugin)
        try:
            plugin.terminate()
        except:
            log_err("Error terminating plugin " + plugin.__name__)
def get_scr(key):
    if isinstance(key, str) and key in Resources.scripts.keys():
        return Resources.scripts[key]
    elif isinstance(key, str):
        utils.log_err("Tried to fetch script '" + str(key) + "' but no such one exists")
    else:
        utils.log_err("Tried to fetch script but passed key was not a string")
def get_grf(key):
    if isinstance(key, str) and key in Resources.graphics.keys():
        return Resources.graphics[key]
    elif isinstance(key, str):
        utils.log_err("Tried to fetch graphics '" + key + "' but no such one exists")
    else:
        utils.log_err("Tried to fetch graphics but passed key was not a string")
def start_console(self):
    utils.log("Starting console...")
    while self.console:
        try:
            user_entry = input("client >>")
            utils.log("Client Console: " + user_entry)
            exec(user_entry)
        except Exception as e:
            utils.log_err("Error in console: " + str(e))
def publish(event_name, **kwargs):
    listeners = event_listeners.get(event_name, [])
    for listener in listeners[:]:
        try:
            result = listener(event_name, **kwargs)
        except:
            log_err("Unhandled exception raised by " + event_name + " event listener")
            continue
        if result:
            break  # handling completed, skip other listeners
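A minimal usage sketch of this pub/sub API, assuming the module-level event_listeners dict that publish() and unsubscribe() above operate on; subscribe() is a hypothetical helper added here for illustration only, not taken from the source.

def subscribe(event_name, listener):
    # hypothetical counterpart to unsubscribe(), for illustration only
    event_listeners.setdefault(event_name, []).append(listener)

def on_player_scored(event_name, **kwargs):
    # a truthy return value makes publish() stop notifying later listeners
    return True

subscribe("player_scored", on_player_scored)
publish("player_scored", player="p1", points=3)
unsubscribe("player_scored", on_player_scored)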
def main():
    log_init()
    log_info('Starting controller')
    try:
        ctrl = Controller(CONTROLLER_CONFIG, PH_CONFIG, PUMP_X_CONFIG,
                          PUMP_Y_CONFIG, SOLUTION_TANK_CONFIG,
                          SUPPLY_TANK_CONFIG)
        ctrl.run()
        log_err('Controller stopped running')
    except Exception as e:
        log_err(str(e))
        log_exception_trace()
def initialize():
    global running_plugins
    for loader, name, isPkg in pkgutil.iter_modules(plugins.__path__):
        if name.startswith("_"):
            continue
        try:
            plugin = __import__("plugins." + name, globals(), locals(), ("plugins",))
        except:
            log_err("Error importing plugin " + name)
            continue
        try:
            plugin.initialize()
        except:
            log_err("Error initializing plugin " + name)
            continue
        running_plugins.add(plugin)
def has_converged(self):
    if self.old_value is None:
        delta = abs(self.new_value)
    else:
        delta = abs(self.new_value - self.old_value)
    print 'delta: ', delta
    if delta <= self.tolerance:
        return True
    if self.iter >= self.max_iter:
        return True
    if self.new_value is None or abs(self.new_value) == float('inf'):
        log_err('\tProblematic convergence!')
        return True
    return False
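A sketch of how this check might be driven from a training loop; the model instance and its train_iteration() method are hypothetical and only illustrate the intended use of old_value, new_value and iter.

# Hypothetical driver loop (illustration only; model and train_iteration() are not from the source).
model.iter = 0
model.old_value = None
model.new_value = None
while True:
    model.old_value = model.new_value
    model.new_value = model.train_iteration()  # e.g. the current log-likelihood
    model.iter += 1
    if model.has_converged():
        break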
def start_receiving(self):
    try:
        plr_conn_pkg = players.PlayerConnectionPackage(env.Game.player_info)
        self.send(plr_conn_pkg)
        while self.listen:
            data = self.socket_to_server.recv(
                Client.RECEIVE_MSG_BUFFER_SIZE).decode('utf-8')
            try:
                json_data = json.loads(data)
                utils.log("Got object from server '" + self.get_key() + "':\n" + data)
            except ValueError:
                utils.log("Got message from server '" + self.get_key() + "':\n" + data)
    except socket.error:
        utils.log_err("Client message receiver thread stopped!")
def send(self, data):
    if isinstance(data, str):
        json_data = data
    else:
        json_data = json.dumps(data, indent=4, separators=(',', ': '))
    sent = 0
    try:
        sent = self.channel.send(json_data.encode('utf-8'))
    except socket.error:
        utils.log_err("Failed to send data!")
    if sent == 0:
        utils.log_wrn("Unable to send data to client; server connection to '" +
                      self.get_key() + "' broken! Auto-cleaning connection...")
        self.close()
    else:
        utils.log("Sent data '" + json_data + "' to '" + self.get_key() + "'")
def start_receiving(self):
    utils.log("Starting server message receiver thread...")
    conn_key = "?"
    try:
        self.connection_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.connection_socket.bind(tuple(self.server))
        self.connection_socket.listen(5)
        utils.log("Ready for new incoming connections!")
        while self.open:
            c, addr = self.connection_socket.accept()
            conn_key = ":".join(str(i) for i in addr)
            utils.log("Receiving connection from " + conn_key + "...")
            self.connection_list[conn_key] = ServerConnection(addr, c, None, self)
            self.connection_list[conn_key].start()
            utils.log("Connection to " + conn_key + " open!")
    except socket.error:
        utils.log_err("Server message receiver thread stopped!")
    utils.log("Connection to " + conn_key + " closed.")
def building_binary_to_decimal_map(self):
    log_err('\tStart generating binary vectors...')
    possible_features = get_binary_vector(self.encoded_observation_dim, self.feature_dim)
    log_err('\tStart mapping feature vectors to int encoding...')
    for possible_feature in possible_features:
        self.feature_symbol_mapping[str(possible_feature)] = len(self.feature_symbol_mapping)
    log_err('\tFinish mapping...')
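A small sketch of the mapping this method builds, assuming get_binary_vector(n, d) enumerates all n = 2**d binary vectors of length d (that behaviour is inferred from the calls above, not confirmed by the source); each stringified vector receives the next integer code in enumeration order.

# Illustration only: a hypothetical 2-bit feature space instead of the real 20-bit one.
feature_symbol_mapping = {}
for possible_feature in [[0, 0], [0, 1], [1, 0], [1, 1]]:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
# feature_symbol_mapping == {'[0, 0]': 0, '[0, 1]': 1, '[1, 0]': 2, '[1, 1]': 3}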
import pickle
import ghmm
from utils import get_binary_vector, log_err
from ghmm import *

FEATURE_DIM = 20  # ???? get rid of one of the feature which is
FEATURE_SPACE = 2**FEATURE_DIM
MAX_ITER = 10
MIN_IMPROVMENT = 0.1

# 1. Read in the data from Mac
log_err('Reading data from pickle...')
fp = open('data.pkl', 'rb')
data = pickle.load(fp)
fp.close()

# 2. Create binary-decimal mapper
log_err('Start generating binary vectors...')
feature_symbol_mapping = {}
possible_features = get_binary_vector(FEATURE_SPACE, FEATURE_DIM)
log_err('Start mapping feature vectors to int encoding...')
for possible_feature in possible_features:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
log_err('Finish mapping...')

# 3. Map the features from binary to decimal
log_err('Converting the features')
encoded_author_sequences = []
encoded_title_sequences = []
encoded_venue_sequences = []
import numpy as np
from chmm import *
from utils import get_binary_vector, log_err
from training_set_generator import get_training_samples_BW

FEATURE_DIM = 20  # ???? get rid of one of the feature which is
FEATURE_SPACE = 2**FEATURE_DIM
MAX_ITER = 10
MIN_IMPROVMENT = 0.1

# 1. Read in the data
log_err('Reading data from retrieval...')
data = get_training_samples_BW(
    'http://scholar.google.com/citations?user=YU-baPIAAAAJ&hl=en', True)

# 2. Create binary-decimal mapper
log_err('Start generating binary vectors...')
feature_symbol_mapping = {}
possible_features = get_binary_vector(FEATURE_SPACE, FEATURE_DIM)
log_err('Start mapping feature vectors to int encoding...')
for possible_feature in possible_features:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
log_err('Finish mapping...')

# 3. Map the features from binary to decimal
log_err('Converting the features')
encoded_author_sequences = []
encoded_title_sequences = []
encoded_venue_sequences = []

for sample in data['author_sequences']:
def train(self, observation_sequences, label_sequences):
    # Setup information
    self.observation_sequences = observation_sequences
    self.label_sequences = label_sequences
    self.training_sample_size = len(label_sequences)
    self.feature_dim = len(observation_sequences[0][0])  # feature dimension
    self.encoded_observation_dim = self.observation_dim**self.feature_dim

    log_err('\tStart generating binary vectors...')
    possible_features = get_binary_vector(self.encoded_observation_dim, self.feature_dim)
    log_err('\tStart mapping feature vectors to int encoding...')
    for possible_feature in possible_features:
        self.feature_symbol_mapping[str(possible_feature)] = len(self.feature_symbol_mapping)
    log_err('\tFinish mapping...')

    pi = np.array([0.0] * self.state_dim)
    transitions = np.array([[0.0] * self.state_dim] * self.state_dim)
    emissions = np.array([[0.0] * self.encoded_observation_dim] * self.state_dim)
    pi_counter = np.array([0.0] * self.state_dim)
    transitions_counter = np.array([[0.0] * self.state_dim] * self.state_dim)
    emissions_counter = np.array([[0.0] * self.encoded_observation_dim] * self.state_dim)
    log_pi = np.array([0.0] * self.state_dim)
    log_transitions = np.array([[0.0] * self.state_dim] * self.state_dim)
    log_emissions = np.array([[0.0] * self.encoded_observation_dim] * self.state_dim)

    log_err('\tStart counting pi...')
    # 1. Counting first state in every label sequence to form pi
    for label_sequence in self.label_sequences:
        pi_counter[label_sequence[0]] += 1.0

    log_err('\tStart counting transitions...')
    # 2. Count all state transitions
    for label_sequence in self.label_sequences:
        for j in range(1, len(label_sequence)):
            transitions_counter[label_sequence[j - 1]][label_sequence[j]] += 1.0

    log_err('\tStart counting emissions...')
    # 3. Count emissions for each label
    for i in range(self.training_sample_size):
        for j in range(len(self.observation_sequences[i])):
            # encode the feature vector into int
            symbol = self.feature_symbol_mapping[str(self.observation_sequences[i][j])]
            emissions_counter[self.label_sequences[i][j]][symbol] += 1.0

    log_err('\tStart forming log probability...')
    # 4. Form log probability, by using Laplace correction to avoid zero probabilities
    if self.useLaplaceRule:
        for i in range(self.state_dim):
            pi_counter[i] += 1.0
            for j in range(self.state_dim):
                transitions_counter[i][j] += 1.0
            for k in range(self.encoded_observation_dim):
                emissions_counter[i][k] += 1.0

    pi_count = sum(pi_counter)
    transition_count = [sum(transition) for transition in transitions_counter]  #????
    emission_count = [sum(emission) for emission in emissions_counter]

    log_err('\tStart computing probability...')
    for i in range(len(pi_counter)):
        pi[i] = pi_counter[i] / pi_count
        log_pi[i] = math.log(pi_counter[i] / pi_count)
    for i in range(self.state_dim):
        for j in range(self.state_dim):
            transitions[i][j] = transitions_counter[i][j] / transition_count[i]
            log_transitions[i][j] = math.log(transitions_counter[i][j] / transition_count[i])
    for i in range(self.state_dim):
        for j in range(self.encoded_observation_dim):
            emissions[i][j] = emissions_counter[i][j] / emission_count[i]
            log_emissions[i][j] = math.log(emissions_counter[i][j] / emission_count[i])

    self.pi = pi
    self.transitions = transitions
    self.emissions = emissions
    self.log_pi = log_pi
    self.log_transitions = log_transitions
    self.log_emissions = log_emissions
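A tiny worked example of the count-and-normalize step above with Laplace correction, using made-up counts (illustration only, not from the source): with 2 states and raw transition counts [[3, 1], [0, 4]], adding 1 to every cell gives [[4, 2], [1, 5]], and dividing each row by its sum yields the transition probabilities.

import numpy as np

# Illustration only: 2 hypothetical states, made-up transition counts.
transitions_counter = np.array([[3.0, 1.0], [0.0, 4.0]])
transitions_counter += 1.0  # Laplace correction: no zero probabilities remain
row_sums = transitions_counter.sum(axis=1, keepdims=True)
transitions = transitions_counter / row_sums
print(transitions)  # [[0.667, 0.333], [0.167, 0.833]]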
import numpy as np
import hmm_faster
from math import log
from utils import get_binary_vector, log_err
from training_set_generator import get_training_samples_BW

FEATURE_DIM = 20  # ???? get rid of one of the feature which is
FEATURE_SPACE = 2**FEATURE_DIM
MAX_ITER = 10
MIN_IMPROVMENT = 0.1

# 1. Read in the data
log_err('Reading data from retrieval...')
# data = get_training_samples_BW('http://scholar.google.com/citations?user=YU-baPIAAAAJ&hl=en', True)
data = get_training_samples_BW(
    'http://scholar.google.com/citations?user=x3LTjz0AAAAJ&hl=en', True)

# 2. Create binary-decimal mapper
log_err('Start generating binary vectors...')
feature_symbol_mapping = {}
possible_features = get_binary_vector(FEATURE_SPACE, FEATURE_DIM)
log_err('Start mapping feature vectors to int encoding...')
for possible_feature in possible_features:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
log_err('Finish mapping...')

# 3. Map the features from binary to decimal
log_err('Converting the features')
encoded_author_sequences = []
encoded_title_sequences = []
    = np.array([z1_uv]), np.array([z2_uv]), np.array([mst_uv]), np.array([msterr_uv]), \
      np.array([phi_uv]), np.array([phierr_uv]), np.array([alp_uv]), np.array([alperr_uv]), np.array([ppr_n])
else:
    lngth = len(mst_uv)

# print('-------------------------------------------------------------')
print('Working on: ' + ppr_n[0])
print('-------------------------------------------------------------')

#
# Calculating SFRD
#
sfrd_uv = np.zeros(len(z1_uv))
sfrd_uv_err = np.zeros(len(z1_uv))
for j in range(len(z1_uv)):
    # Computing parameters array
    logphi, logphi_err = utl.log_err(phi_uv[j], phierr_uv[j])
    mean_all = np.array([mst_uv[j], logphi, alp_uv[j]])
    err_all = np.array([msterr_uv[j], logphi_err, alperr_uv[j]])
    zcen = (z1_uv[j] + z2_uv[j]) / 2
    # lst11 = utl.m_to_l_wave(mean_all[0], 1500)
    lt1 = 0.00001 / kap_uv
    sfr2, sfr2e = cov.sfrd_w_err(lum=lums_all, z=zcen, mean2=mean_all,
                                 err2=err_all, kappa=kap_uv, limit=lt1)
    sfrd_uv[j], sfrd_uv_err[j] = sfr2, sfr2e
    f22.write(ppr_n[0] + '\t' + str(z1_uv[j]) + '\t' + str(z2_uv[j]) + '\t' +
              str(sfr2) + '\t' + str(sfr2e) + '\n')
#
def run_with_boosting_features(self): i = 0 self.new_labels = [] self.combined_labels = [] for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences): feature_vectors, new_labels = self.hmm_new.decode(raw_segment, True, True, self.token_BGM, self.pattern_BGM) self.new_labels.append(new_labels) tokens = Tokens(raw_segment).tokens print i, ': ', raw_segment # Combination step: tmp_combined_labels = [] # the decided combined labels so far for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors): # Combine old and new labels to come out a combined label, and deciding... combined_label = -1 if old_label == new_label: combined_label = new_label tmp_combined_labels.append(new_label) # Combine compatible labels: FN and LN elif old_label in [0,1] and new_label in [0,1]: combined_label = old_label tmp_combined_labels.append(new_label) # Combine labels that are not compatible else: tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(feature_vector) # Get the Background knowledge provided the feature vector: the language feature model sorted_label_distribution = sorted(tmp_feature_entity.label_distribution.iteritems(), key=operator.itemgetter(1), reverse=True) total_label_occurence = float(sum(tmp[1] for tmp in sorted_label_distribution)) # ============================================================================================ # ============================================================================================ # ???? Experimenting: removing the low prob label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED # sorted_label_distribution = [] # sum_prob = 0.0 # for pair in tmp_sorted_label_distribution: # sorted_label_distribution.append(pair) # sum_prob += pair[1] # if sum_prob/total_label_occurence >= 0.90: # break # ============================================================================================ # ============================================================================================ # Dominant label case: Iterate from the highest label stats according to this feature vector: for label_frequency in sorted_label_distribution: if int(label_frequency[0]) in [old_label, new_label] and (label_frequency[1]/total_label_occurence)>=self.DOMINANT_RATIO: print 'Dominant labels' # Check for constraint: tmp_label_to_check = int(label_frequency[0]) # Find last occurence position of this label if tmp_label_to_check not in [0,1]: last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check)) elif tmp_label_to_check in [0,1]: last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0') last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1') last_occurence = max(last_occurence_0, last_occurence_1) # Checking constraints by simplifying what we did in viterbi if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1): # Never occurred, or last occurence is the last label # When we are deciding the first label if len(tmp_combined_labels) == 0: first_bit = self.find_majority_structure()[0] if first_bit == 0 and tmp_label_to_check not in [0,1]: continue if first_bit == 3 and tmp_label_to_check != 3: continue # VN CANNOT FOLLOW TI W/O DL constraint if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3: continue elif tmp_label_to_check in [0,1]: flag = False for j in range(last_occurence, len(tmp_combined_labels)): if tmp_combined_labels[j] not in [0,1,2]: flag = True break if flag: continue elif tmp_label_to_check == 3: 
continue elif tmp_label_to_check == 4: if tmp_combined_labels[-1] == 3: #???? continue combined_label = tmp_label_to_check tmp_combined_labels.append(tmp_label_to_check) break # No dominance case OR Dominance-fail-due-to-constraint case: Find relatively if the label with higher possibility follow the constraint of publication order if combined_label == -1: # Iterate from the highest label stats according to this feature vector: for label_frequency in sorted_label_distribution: breakout_flag = False #Test against constraints # 1. DL separate labels principle # 2. AU-TI-VN Order if int(label_frequency[0]) in [old_label, new_label]: tmp_label_to_check = int(label_frequency[0]) # find structure of the order, and find what have appeared, and so predict what to be appear next structure_overview = [] #will record the order in big sense: 0,3,4/4,0,3 for tmp_combined_label in tmp_combined_labels: if tmp_combined_label in [2,5]: continue elif tmp_combined_label in [0,1]: if 0 in structure_overview: continue else: structure_overview.append(0) elif tmp_combined_label == 3: if 3 in structure_overview: continue else: structure_overview.append(3) elif tmp_combined_label == 4: if 4 in structure_overview: continue else: structure_overview.append(4) # Based on the structure overview, find what should appear next appear_next = [] if structure_overview == [0]: appear_next = [0,1,3,2,5] elif structure_overview == [3]: appear_next = [3,0,1,2,5] elif structure_overview == [0,3]: appear_next = [3,4,2,5] elif structure_overview == [3,0]: appear_next = [0,1,4,2,5] elif structure_overview == [0,3,4]: appear_next = [4,2,5] elif structure_overview == [3,0,4]: appear_next = [4,2,5] else: #weird case print 'Weird structure! Weird case!' if tmp_feature_entity.label_distribution[str(old_label)] > tmp_feature_entity.label_distribution[str(new_label)]: tmp_label_to_check_list = [old_label, new_label] else: tmp_label_to_check_list = [new_label, old_label] # Apply constraints here too for tmp_label_to_check in tmp_label_to_check_list: if tmp_label_to_check not in [0,1]: last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check)) elif tmp_label_to_check in [0,1]: last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0') last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1') last_occurence = max(last_occurence_0, last_occurence_1) # Checking constraints by simplifying what we did in viterbi if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1): # When we are deciding the first label if len(tmp_combined_labels) == 0: first_bit = self.find_majority_structure()[0] if first_bit == 0 and tmp_label_to_check not in [0,1]: continue if first_bit == 3 and tmp_label_to_check != 3: continue try: if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3: continue except: continue elif tmp_label_to_check in [0,1]: flag = False for j in range(last_occurence, len(tmp_combined_labels)): if tmp_combined_labels[j] not in [0,1,2]: flag = True break if flag: continue elif tmp_label_to_check == 3: continue elif tmp_label_to_check == 4: if tmp_combined_labels[-1] == 3: continue combined_label = tmp_label_to_check tmp_combined_labels.append(combined_label) breakout_flag = True break if breakout_flag: break if tmp_label_to_check in appear_next: # Then check constraint. 
find last occurence, DL constraints # Just need to check DL constraints, no need to verify more on tokens, assume token verification is done in the first iteration if tmp_label_to_check not in [0,1]: last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check)) elif tmp_label_to_check in [0,1]: last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0') last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1') last_occurence = max(last_occurence_0, last_occurence_1) # Checking constraints by simplifying what we did in viterbi if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1): if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3: #Hardcode rule [2013/07/23]: For VN, cannot directly follow a TI without DL???? may remove on real effect continue elif tmp_label_to_check in [0,1]: flag = False for j in range(last_occurence, len(tmp_combined_labels)): if tmp_combined_labels[j] not in [0,1,2]: flag = True break if flag: continue elif tmp_label_to_check == 3: continue # flag = False # for j in range(last_occurence, len(tmp_combined_labels)): # if tmp_combined_labels[j] not in [3,2]: # flag = True # break # if flag: # continue elif tmp_label_to_check == 4: if tmp_combined_labels[-1] == 3: #???? continue # elif tmp_label_to_check == 2: # elif tmp_label_to_check == 5: # Otherwise, pass log_err('\t\t' + str(i) + 'Should combine this one') combined_label = tmp_label_to_check tmp_combined_labels.append(tmp_label_to_check) # combined_label = (tmp_label_to_check, sorted_label_distribution) break else: continue # Debug if combined_label == -1: log_err(str(i) + 'problem') combined_label = (appear_next, sorted_label_distribution) tmp_combined_labels.append(-1) # Final check the accordance with the major order, ideally, all records under one domain should have the same order... PS very ugly code I admit print '==========================tmp_combined_labels', tmp_combined_labels majority_order_structure = self.find_majority_structure()[1] majority_rate = self.find_majority_structure()[2] tmp_combined_labels_length = len(tmp_combined_labels) if majority_rate > 0.80 and majority_order_structure == [0,3,4]: # p1(phase1): author segments for p1 in range(tmp_combined_labels_length): if tmp_combined_labels[p1] in [0,1,2,5]: continue else: break # p2(phase2): title segments for p2 in range(p1, tmp_combined_labels_length): if tmp_combined_labels[p2] == 3: continue else: break #p3(phase3): venue segments for p3 in range(p2, tmp_combined_labels_length): if tmp_combined_labels[p3] in [2,5,4]: continue else: break # Decision if p1 == 0: print 'Houston we got a SERIOUS problem!' log_err('Houston we got a SERIOUS problem!!!!!!!!') if p2 == p1: print 'Houston we got a problem!' for sp2 in range(p2, tmp_combined_labels_length): if tmp_combined_labels[sp2] != 2: tmp_combined_labels[sp2] = 3 else: break # should fix common mislabeling at this point now?????????? # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]: # ???? 
not sure if this is normal # # p1(phase1): title segments # for p1 in range(tmp_combined_labels_length): # if tmp_combined_labels[p1] in [3]: # continue # else: # break # # p2(phase2): author segments # for p2 in range(p1, tmp_combined_labels_length): # if tmp_combined_labels[p2] == 3: # continue # else: # break # #p3(phase3): venue segments # for p3 in range(p2, tmp_combined_labels_length): # if tmp_combined_labels[p3] in [2,5,4]: # continue # else: # break # # Decision # if p1 == 0: # print 'Houston we got a SERIOUS problem!' # log_err('Houston we got a SERIOUS problem!!!!!!!!') # if p2 == p1: # print 'Houston we got a problem!' # for sp2 in range(p2, tmp_combined_labels_length): # if tmp_combined_labels[sp2] != 2: # tmp_combined_labels[sp2] = 3 # else: # break for old_label, new_label, tmp_combined_label, token, feature_vector in zip(label_sequence, new_labels, tmp_combined_labels, tokens, feature_vectors): print to_label(old_label), '\t', to_label(new_label), '\t', to_label(tmp_combined_label), '\t', token, '\t', feature_vector print '\n' i+=1
def start_receiving(self):
    try:
        while self.listen:
            try:
                data = self.channel.recv(Client.RECEIVE_MSG_BUFFER_SIZE).decode('utf-8')
                try:
                    json_data = json.loads(data)
                    # check for valid player connection package
                    conn_dict = players.PlayerConnectionPackage.__dict__
                    is_valid_connection_pkg = True
                    for key in json_data:
                        if key not in conn_dict:
                            is_valid_connection_pkg = False
                    if is_valid_connection_pkg:
                        # if the received package is a valid connection package,
                        # read it and save its contents to the current player
                        # and set the player_is_ready flag to true!
                        if self.player_is_ready:
                            utils.log_wrn("Received connection package from client '" +
                                          self.get_key() +
                                          "', but client already connected! Ignoring old package...")
                            self.player = players.Player(json_data)
                        else:
                            utils.log("Received connection package from client '" +
                                      self.get_key() + "'!")
                            self.player = players.Player(json_data)
                            self.player_is_ready = True
                    else:
                        # check for valid player input package
                        input_dict = players.PlayerInputPackage.__dict__
                        is_valid_input_pkg = True
                        for key in json_data:
                            if key not in input_dict:
                                is_valid_input_pkg = False
                        if is_valid_input_pkg:
                            utils.log_wrn("Debug - VALID INPUT PACKAGE!")
                        else:
                            utils.log_wrn("Could not interpret client package...")
                            raise ValueError
                except ValueError:
                    # ValueError here means the package received was not a JSON object,
                    # but rather a regular text message, used for events
                    utils.log("Got message from client '" + self.get_key() + "':\n" + data)
            except (ConnectionResetError, ConnectionAbortedError,
                    ConnectionRefusedError, ConnectionError):
                self.close()
                utils.log("Connection to client '" + self.get_key() + "' has been closed.")
    except socket.error:
        utils.log_err("Message receiver thread stopped for client '" + self.get_key() + "'!")
        self.close()
def get_training_samples(url): log_err('\tGetting Training sample') raw_results = router(url) log_err('\tData retrieved. Preprocessing...') observation_list = [] label_list = [] records = [] feature_generator = FeatureGenerator() token_generator = Tokens() for raw_result in raw_results: tmp_record = '' tmp_observation_list = [] tmp_label_list = [] authors = raw_result['authors'] title = raw_result['title'] title_copy = raw_result['title'] try: venue = raw_result['conference name'] venue_copy = raw_result['conference name'] except: venue = '' venue_copy = '' try: venue = raw_result['journal name'] venue_copy = raw_result['journal name'] except: venue = '' venue_copy = '' if len(venue) > 0: try: volume = raw_result['volume'] except: volume = '' try: issue = raw_result['issue'] except: issue = '' try: page = raw_result['page'] except: page = '' venue += ' ' + volume + ' ' + issue + ' ' + page venue_copy += ' ' + volume + ' ' + issue + ' ' + page date = raw_result['publication date'][:4] # FN: 0 # LN: 1 # DL: 2 # TI: 3 # VN: 4 # DT: 5 # Author -> Title -> ... # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) tmp_label_list += [1,2] # title title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) tmp_label_list += [2] # venue if len(venue) > 0: venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) #=================================Variations of authors================================= # Changing order, inserting dot, and probably insert comma as delimiter inside of names # This part of variations is very sensitive to what sample source to choose from, # for example, Google scholar is the current source of samples, and on gscholar, # most names are in format of JW Han. <-- Prior knowledge # Read more Learn more Change the Globe !!! log_err('\tGenerating multiple cases for name variations... ') # ================================A. B tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # authors for author in authors: if len(author) == 0: continue #???? BUG!!!! 
split() doesn't mean tokenization author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order tokens if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order to if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B, # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A., # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A. # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title # title += ' , ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: # venue += ' , ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! 
# Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ #============================================================================================================================================ # Period Case!!! log_err('\tGenerating multiple cases for period as DL... ') # Author -> Title -> ... # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title title = title_copy + ' . ' tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: venue = venue_copy + ' . ' tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================A. 
B tmp_record = '' tmp_observation_list = [] tmp_label_list = [] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order tokens if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] # Split the author in order to if len(author_tokens) == 1: # Cannot change order or anything, so leave this name alone, and pass to the next name author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Insert dot author = author_tokens[0] + '.' + author_tokens[1] + ' , ' # A. B tmp_token_length = token_generator.token_length(author) tmp_record += author tmp_label_list += [0]*(tmp_token_length-2) + [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B, # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! 
tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Only keep lastname author = author_tokens[1] + ' , ' # B tmp_record += author tmp_label_list += [1,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A., # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '.,' # B A., tmp_record += author tmp_label_list += [1,0,0,2] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! 
tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # ================================B A. # authors tmp_record = '' tmp_observation_list = [] tmp_label_list = [] for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # title tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # Title -> Author -> ... tmp_record = '' tmp_observation_list = [] tmp_label_list = [] tmp_record += title tmp_label_list += [3] * (feature_generator.token_length(title)-1) #!!!! tmp_label_list += [2] # authors for author in authors: if len(author) == 0: continue author_tokens = token_generator.tokenize(author)['tokens'] if len(author_tokens) == 1: author += ' , ' tmp_record += author tmp_label_list += [1,2] elif len(author_tokens) == 2: # Change order and insert dot author = author_tokens[1] + ' ' + author_tokens[0] + '. ' # B A. tmp_record += author tmp_label_list += [1,0,0] else: # name contains more than two tokens, just leave it for now author += ' , ' tmp_record += author tmp_label_list += [0] * (feature_generator.token_length(author)-2) #!!!! tmp_label_list += [1,2] # venue if len(venue) > 0: tmp_record += venue tmp_label_list += [4] * (feature_generator.token_length(venue)-1) #!!!! tmp_label_list += [2] # date if len(date) > 0: tmp_record += date tmp_label_list += [5] * feature_generator.token_length(date) #!!!! # Aggregate and append label_list.append(tmp_label_list) records.append(tmp_record) observation_list.append(feature_generator.build(tmp_record)) # =============================================================================Verbose: Print the training set for record, observation, label in zip(records, observation_list, label_list): for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'], observation, label): if ll == 0: ll = 'FN' elif ll == 1: ll = 'LN' elif ll == 2: ll = 'DL' elif ll == 3: ll = 'TI' elif ll == 4: ll = 'VN' elif ll == 5: ll = 'DT' print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8') print '\n\n' return observation_list, label_list
def run_with_boosting_features(self):
    i = 0
    self.new_labels = []
    self.combined_labels = []
    for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
        feature_vectors, new_labels = self.hmm_new.decode(raw_segment, True, True, self.token_BGM, self.pattern_BGM)
        self.new_labels.append(new_labels)
        tokens = Tokens(raw_segment).tokens
        print i, ': ', raw_segment

        # Combination step:
        tmp_combined_labels = []  # the combined labels decided so far
        for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):
            # Combine the old and new labels into a single combined label
            combined_label = -1
            if old_label == new_label:
                combined_label = new_label
                tmp_combined_labels.append(new_label)
            # Combine compatible labels: FN and LN
            elif old_label in [0, 1] and new_label in [0, 1]:
                combined_label = old_label
                tmp_combined_labels.append(new_label)
            # Combine labels that are not compatible
            else:
                # Get the background knowledge for this feature vector: the language feature model
                tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(feature_vector)
                sorted_label_distribution = sorted(tmp_feature_entity.label_distribution.iteritems(), key=operator.itemgetter(1), reverse=True)
                total_label_occurence = float(sum(tmp[1] for tmp in sorted_label_distribution))

                # ???? Experiment: removing the low-probability label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED
                # sorted_label_distribution = []
                # sum_prob = 0.0
                # for pair in tmp_sorted_label_distribution:
                #     sorted_label_distribution.append(pair)
                #     sum_prob += pair[1]
                #     if sum_prob / total_label_occurence >= 0.90:
                #         break

                # Dominant label case: iterate from the highest label stats for this feature vector
                for label_frequency in sorted_label_distribution:
                    if int(label_frequency[0]) in [old_label, new_label] and (label_frequency[1] / total_label_occurence) >= self.DOMINANT_RATIO:
                        print 'Dominant labels'
                        # Check for constraint:
                        tmp_label_to_check = int(label_frequency[0])
                        # Find the last occurrence position of this label
                        if tmp_label_to_check not in [0, 1]:
                            last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                        elif tmp_label_to_check in [0, 1]:
                            last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                            last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                            last_occurence = max(last_occurence_0, last_occurence_1)
                        # Check constraints by simplifying what we did in viterbi
                        if last_occurence == -1 or last_occurence == (len(tmp_combined_labels) - 1):
                            # Never occurred, or the last occurrence is the last label
                            # When we are deciding the first label
                            if len(tmp_combined_labels) == 0:
                                first_bit = self.find_majority_structure()[0]
                                if first_bit == 0 and tmp_label_to_check not in [0, 1]:
                                    continue
                                if first_bit == 3 and tmp_label_to_check != 3:
                                    continue
                            # Constraint: VN cannot follow TI without a DL
                            if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                continue
                        elif tmp_label_to_check in [0, 1]:
                            flag = False
                            for j in range(last_occurence, len(tmp_combined_labels)):
                                if tmp_combined_labels[j] not in [0, 1, 2]:
                                    flag = True
                                    break
                            if flag:
                                continue
                        elif tmp_label_to_check == 3:
                            continue
                        elif tmp_label_to_check == 4:
                            if tmp_combined_labels[-1] == 3:  # ????
                                continue
                        combined_label = tmp_label_to_check
                        tmp_combined_labels.append(tmp_label_to_check)
                        break

                # No-dominance case OR dominance-failed-due-to-constraint case: check whether the label with
                # higher probability follows the constraints of the publication order
                if combined_label == -1:
                    # Iterate from the highest label stats for this feature vector:
                    for label_frequency in sorted_label_distribution:
                        breakout_flag = False
                        # Test against constraints:
                        # 1. DL separate labels principle
                        # 2. AU-TI-VN order
                        if int(label_frequency[0]) in [old_label, new_label]:
                            tmp_label_to_check = int(label_frequency[0])
                            # Find the structure of the order and what has already appeared, to predict what should appear next
                            structure_overview = []  # records the order in a broad sense: 0,3,4 / 4,0,3
                            for tmp_combined_label in tmp_combined_labels:
                                if tmp_combined_label in [2, 5]:
                                    continue
                                elif tmp_combined_label in [0, 1]:
                                    if 0 in structure_overview:
                                        continue
                                    else:
                                        structure_overview.append(0)
                                elif tmp_combined_label == 3:
                                    if 3 in structure_overview:
                                        continue
                                    else:
                                        structure_overview.append(3)
                                elif tmp_combined_label == 4:
                                    if 4 in structure_overview:
                                        continue
                                    else:
                                        structure_overview.append(4)
                            # Based on the structure overview, find what should appear next
                            appear_next = []
                            if structure_overview == [0]:
                                appear_next = [0, 1, 3, 2, 5]
                            elif structure_overview == [3]:
                                appear_next = [3, 0, 1, 2, 5]
                            elif structure_overview == [0, 3]:
                                appear_next = [3, 4, 2, 5]
                            elif structure_overview == [3, 0]:
                                appear_next = [0, 1, 4, 2, 5]
                            elif structure_overview == [0, 3, 4]:
                                appear_next = [4, 2, 5]
                            elif structure_overview == [3, 0, 4]:
                                appear_next = [4, 2, 5]
                            else:
                                # weird case
                                print 'Weird structure! Weird case!'
                                if tmp_feature_entity.label_distribution[str(old_label)] > tmp_feature_entity.label_distribution[str(new_label)]:
                                    tmp_label_to_check_list = [old_label, new_label]
                                else:
                                    tmp_label_to_check_list = [new_label, old_label]
                                # Apply constraints here too
                                for tmp_label_to_check in tmp_label_to_check_list:
                                    if tmp_label_to_check not in [0, 1]:
                                        last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0, 1]:
                                        last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                        last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                        last_occurence = max(last_occurence_0, last_occurence_1)
                                    # Check constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (len(tmp_combined_labels) - 1):
                                        # When we are deciding the first label
                                        if len(tmp_combined_labels) == 0:
                                            first_bit = self.find_majority_structure()[0]
                                            if first_bit == 0 and tmp_label_to_check not in [0, 1]:
                                                continue
                                            if first_bit == 3 and tmp_label_to_check != 3:
                                                continue
                                        try:
                                            if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                                continue
                                        except:
                                            continue
                                    elif tmp_label_to_check in [0, 1]:
                                        flag = False
                                        for j in range(last_occurence, len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [0, 1, 2]:
                                                flag = True
                                                break
                                        if flag:
                                            continue
                                    elif tmp_label_to_check == 3:
                                        continue
                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:
                                            continue
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(combined_label)
                                    breakout_flag = True
                                    break
                                if breakout_flag:
                                    break
                            if tmp_label_to_check in appear_next:
                                # Then check constraints: find last occurrence, DL constraints.
                                # Only DL constraints need checking here; token verification is assumed done in the first iteration
                                if tmp_label_to_check not in [0, 1]:
                                    last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                elif tmp_label_to_check in [0, 1]:
                                    last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                    last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                    last_occurence = max(last_occurence_0, last_occurence_1)
                                # Check constraints by simplifying what we did in viterbi
                                if last_occurence == -1 or last_occurence == (len(tmp_combined_labels) - 1):
                                    # Hardcoded rule [2013/07/23]: VN cannot directly follow a TI without a DL ???? may remove depending on real effect
                                    if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                        continue
                                elif tmp_label_to_check in [0, 1]:
                                    flag = False
                                    for j in range(last_occurence, len(tmp_combined_labels)):
                                        if tmp_combined_labels[j] not in [0, 1, 2]:
                                            flag = True
                                            break
                                    if flag:
                                        continue
                                elif tmp_label_to_check == 3:
                                    continue
                                    # flag = False
                                    # for j in range(last_occurence, len(tmp_combined_labels)):
                                    #     if tmp_combined_labels[j] not in [3, 2]:
                                    #         flag = True
                                    #         break
                                    # if flag:
                                    #     continue
                                elif tmp_label_to_check == 4:
                                    if tmp_combined_labels[-1] == 3:  # ????
                                        continue
                                # elif tmp_label_to_check == 2:
                                # elif tmp_label_to_check == 5:
                                # Otherwise, pass
                                log_err('\t\t' + str(i) + 'Should combine this one')
                                combined_label = tmp_label_to_check
                                tmp_combined_labels.append(tmp_label_to_check)
                                # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                break
                            else:
                                continue
                # Debug
                if combined_label == -1:
                    log_err(str(i) + 'problem')
                    combined_label = (appear_next, sorted_label_distribution)
                    tmp_combined_labels.append(-1)

        # Final check of accordance with the majority order; ideally, all records under one domain should have the same order (admittedly ugly code)
        print '==========================tmp_combined_labels', tmp_combined_labels
        majority_order_structure = self.find_majority_structure()[1]
        majority_rate = self.find_majority_structure()[2]
        tmp_combined_labels_length = len(tmp_combined_labels)
        if majority_rate > 0.80 and majority_order_structure == [0, 3, 4]:
            # p1 (phase 1): author segments
            for p1 in range(tmp_combined_labels_length):
                if tmp_combined_labels[p1] in [0, 1, 2, 5]:
                    continue
                else:
                    break
            # p2 (phase 2): title segments
            for p2 in range(p1, tmp_combined_labels_length):
                if tmp_combined_labels[p2] == 3:
                    continue
                else:
                    break
            # p3 (phase 3): venue segments
            for p3 in range(p2, tmp_combined_labels_length):
                if tmp_combined_labels[p3] in [2, 5, 4]:
                    continue
                else:
                    break
            # Decision
            if p1 == 0:
                print 'Houston we got a SERIOUS problem!'
                log_err('Houston we got a SERIOUS problem!!!!!!!!')
            if p2 == p1:
                print 'Houston we got a problem!'
                for sp2 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[sp2] != 2:
                        tmp_combined_labels[sp2] = 3
                    else:
                        break
        # should fix common mislabeling at this point now ??????????
        # elif majority_rate > 0.80 and majority_order_structure == [3, 0, 4]:  # ???? not sure if this is normal
        #     # p1 (phase 1): title segments
        #     for p1 in range(tmp_combined_labels_length):
        #         if tmp_combined_labels[p1] in [3]:
        #             continue
        #         else:
        #             break
        #     # p2 (phase 2): author segments
        #     for p2 in range(p1, tmp_combined_labels_length):
        #         if tmp_combined_labels[p2] == 3:
        #             continue
        #         else:
        #             break
        #     # p3 (phase 3): venue segments
        #     for p3 in range(p2, tmp_combined_labels_length):
        #         if tmp_combined_labels[p3] in [2, 5, 4]:
        #             continue
        #         else:
        #             break
        #     # Decision
        #     if p1 == 0:
        #         print 'Houston we got a SERIOUS problem!'
        #         log_err('Houston we got a SERIOUS problem!!!!!!!!')
        #     if p2 == p1:
        #         print 'Houston we got a problem!'
        #         for sp2 in range(p2, tmp_combined_labels_length):
        #             if tmp_combined_labels[sp2] != 2:
        #                 tmp_combined_labels[sp2] = 3
        #             else:
        #                 break

        for old_label, new_label, tmp_combined_label, token, feature_vector in zip(label_sequence, new_labels, tmp_combined_labels, tokens, feature_vectors):
            print to_label(old_label), '\t', to_label(new_label), '\t', to_label(tmp_combined_label), '\t', token, '\t', feature_vector
        print '\n'
        i += 1
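# -----------------------------------------------------------------------------
# The combination step above recomputes "where did this label last occur?" in
# several places by joining the combined labels into a string and calling
# rfind. A helper along the following lines could factor that out; this is a
# sketch only (last_occurrence_of is a hypothetical name), shown to clarify the
# constraint check rather than to replace the code above.
def last_occurrence_of(label, combined_labels):
    # FN (0) and LN (1) are treated as one author block, so take the later of the two
    as_string = ''.join(str(c) for c in combined_labels)
    if label in [0, 1]:
        return max(as_string.rfind('0'), as_string.rfind('1'))
    return as_string.rfind(str(label))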
def init(client=False):
    utils.log("Loading settings...")
    # check if the settings file exists
    if os.path.isfile(Environment.SETTINGS_FILE_PATH):
        # exists - read it
        bad = False
        with open(Environment.SETTINGS_FILE_PATH) as settings_file:
            try:
                data = json.load(settings_file)
                # check whether all settings keys exist in the opened file
                for settings_key in Game.settings:
                    if settings_key not in data:
                        # if a key is missing, log it and recreate the default settings file
                        utils.log_wrn("Missing settings key '" + settings_key + "'! Reverting to default settings...")
                        json_data = json.dumps(Game.settings, indent=4, separators=(',', ':'))
                        with open(Environment.SETTINGS_FILE_PATH, "w") as dump_file:
                            dump_file.write(json_data)
                        bad = True
                        break
            except ValueError:
                # json.load raises ValueError on malformed JSON
                utils.log_err("Failed to read settings file; bad JSON format")
                bad = True
        # set settings
        if not bad:
            Game.settings = data
    else:
        # does not exist - create it
        utils.log("No settings file, creating new file with defaults...")
        json_data = json.dumps(Game.settings, indent=4, separators=(',', ':'))
        try:
            with open(Environment.SETTINGS_FILE_PATH, "w") as dump_file:
                dump_file.write(json_data)
        except IOError:
            utils.log_err("Failed to create default settings file!")

    utils.log("Loading bindings...")
    # check if the bindings file exists
    if os.path.isfile(Environment.BINDINGS_FILE_PATH):
        # exists - read it
        bad = False
        with open(Environment.BINDINGS_FILE_PATH) as bindings_file:
            try:
                data = json.load(bindings_file)
                # check whether all binding keys exist in the opened file
                for binding_key in Game.bindings:
                    if binding_key not in data:
                        # if a key is missing, log it and recreate the default bindings file
                        utils.log_wrn("Missing binding key '" + binding_key + "'! Reverting to default settings...")
                        json_data = json.dumps(Game.bindings, indent=4, separators=(',', ':'))
                        with open(Environment.BINDINGS_FILE_PATH, "w") as dump_file:
                            dump_file.write(json_data)
                        bad = True
                        break
            except ValueError:
                utils.log_err("Failed to read bindings file; bad JSON format")
                bad = True
        # set bindings
        if not bad:
            Game.bindings = data
    else:
        # does not exist - create it
        utils.log("No bindings file, creating new file with defaults...")
        json_data = json.dumps(Game.bindings, indent=4, separators=(',', ':'))
        try:
            with open(Environment.BINDINGS_FILE_PATH, "w") as dump_file:
                dump_file.write(json_data)
        except IOError:
            utils.log_err("Failed to create default bindings file!")

    # if the client flag is set, do not load player info or initialize pygame
    if not client:
        utils.log("Loading player info...")
        # check if the player info file exists
        if os.path.isfile(Environment.PLAYER_INFO_FILE_PATH):
            # exists - read it
            bad = False
            with open(Environment.PLAYER_INFO_FILE_PATH) as player_info_file:
                try:
                    data = json.load(player_info_file)
                    # check whether all player info keys exist in the opened file
                    for player_info_key in Game.player_info:
                        if player_info_key not in data:
                            # if a key is missing, log it and recreate the default player info file
                            utils.log_wrn("Missing player info key '" + player_info_key + "'! Reverting to default settings...")
                            json_data = json.dumps(Game.player_info, indent=4, separators=(',', ':'))
                            with open(Environment.PLAYER_INFO_FILE_PATH, "w") as dump_file:
                                dump_file.write(json_data)
                            bad = True
                            break
                except ValueError:
                    utils.log_err("Failed to read player info file; bad JSON format")
                    bad = True
            # set player info
            if not bad:
                Game.player_info = data
        else:
            # does not exist - create it
            utils.log("No player info file, creating new file with defaults...")
            json_data = json.dumps(Game.player_info, indent=4, separators=(',', ':'))
            try:
                with open(Environment.PLAYER_INFO_FILE_PATH, "w") as dump_file:
                    dump_file.write(json_data)
            except IOError:
                utils.log_err("Failed to create default player info file!")

        # initialize pygame
        utils.log("Initializing PyGame library...")
        pygame.mixer.pre_init(44100, 16, 2, 4096)
        pygame.init()
        Game.screen = pygame.display.set_mode(Game.settings["screen_size"])
        pygame.display.set_caption(Game.settings["title"])
        pygame.display.set_icon(pygame.image.load(Game.settings["icon"]))
        utils.log("PyGame library initialized!")
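# -----------------------------------------------------------------------------
# init() above repeats the same load-or-create pattern for settings, bindings
# and player info. The sketch below shows how that pattern could be factored
# into one helper; load_or_create_config is an assumed (hypothetical) function,
# not part of the existing API, and it reuses only the utils logging calls and
# JSON formatting already seen in init().
def load_or_create_config(path, defaults, label):
    if os.path.isfile(path):
        try:
            with open(path) as config_file:
                data = json.load(config_file)
        except ValueError:
            utils.log_err("Failed to read " + label + " file; bad JSON format")
            return dict(defaults)
        missing = [key for key in defaults if key not in data]
        if not missing:
            return data
        utils.log_wrn("Missing " + label + " key '" + missing[0] + "'! Reverting to defaults...")
    else:
        utils.log("No " + label + " file, creating new file with defaults...")
    # (Re)write the defaults to disk and fall back to them in memory
    try:
        with open(path, "w") as dump_file:
            dump_file.write(json.dumps(defaults, indent=4, separators=(',', ':')))
    except IOError:
        utils.log_err("Failed to create default " + label + " file!")
    return dict(defaults)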