Example #1
    def send(self, data):
        if self.socket_to_server is None:
            utils.log_err(
                "Tried to send data to server, but connection is closed...")
            return

        if isinstance(data, str):
            json_data = data
        else:
            try:
                json_data = json.dumps(data, indent=4, separators=(',', ': '))
            except TypeError:
                json_data = json.dumps(data.__dict__,
                                       indent=4,
                                       separators=(',', ': '))

        sent = 0
        try:
            sent = self.socket_to_server.send(json_data.encode('utf-8'))
        except socket.error:
            utils.log_err("Failed to send data!")

        if sent == 0:
            utils.log_wrn(
                "Unable to send data to server; client connection to '" +
                self.get_key() + "' broken! Auto-cleaning connection...")
            self.close()
        else:
            utils.log("Sent data '" + json_data + "' to '" + self.get_key() +
                      "'")
Example #2
def unsubscribe(event_name, listener):
    try:
        event_listeners[event_name].remove(listener)
    except KeyError:
        log_err("Trying to unsubscribe from non existent event: " + event_name)
    except ValueError:
        log_err("Trying to unsubscribe not subscribed listener from event: " + event_name)
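`unsubscribe` assumes a module-level `event_listeners` mapping and a matching `subscribe`, neither of which appears on this page. A plausible sketch, consistent with the KeyError/ValueError handling above:

# Assumed registry: event name -> list of listener callables.
event_listeners = {}

def subscribe(event_name, listener):
    # setdefault keeps unsubscribe's KeyError meaningful: an event name
    # only exists in the dict once something has subscribed to it.
    event_listeners.setdefault(event_name, []).append(listener)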
Example #3
def terminate():
    global running_plugins
    for plugin in list(running_plugins):
        running_plugins.discard(plugin)
        try:
            plugin.terminate()
        except Exception:
            log_err("Error terminating plugin " + plugin.__name__)
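These snippets all lean on a small set of logging helpers (`log`, `log_wrn`, `log_err`, sometimes behind a `utils` module) that the page never shows. A minimal sketch; the prefixes and output streams are assumptions, and the real module may add timestamps or file output:

import sys

def log(msg):
    print("[INFO]  " + msg)

def log_wrn(msg):
    print("[WARN]  " + msg)

def log_err(msg):
    print("[ERROR] " + msg, file=sys.stderr)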
Example #4
def get_scr(key):
    if isinstance(key, str) and key in Resources.scripts:
        return Resources.scripts[key]
    elif isinstance(key, str):
        utils.log_err("Tried to fetch script '" + key +
                      "' but no such one exists")
    else:
        utils.log_err("Tried to fetch script but passed key was not a string")
Example #5
def get_grf(key):
    if isinstance(key, str) and key in Resources.graphics:
        return Resources.graphics[key]
    elif isinstance(key, str):
        utils.log_err("Tried to fetch graphics '" + key +
                      "' but no such one exists")
    else:
        utils.log_err(
            "Tried to fetch graphics but passed key was not a string")
Example #6
    def start_console(self):
        utils.log("Starting console...")
        while self.console:
            try:
                user_entry = input("client >>")
                utils.log("Client Console: " + user_entry)
                exec(user_entry)
            except Exception as e:
                utils.log_err("Error in console; " + str(e))
Example #7
def publish(event_name, **kwargs):
    listeners = event_listeners.get(event_name, [])
    for listener in listeners[:]:
        try:
            result = listener(event_name, **kwargs)
        except Exception:
            log_err("Unhandled exception raised by " + event_name + " event listener")
            continue
        if result:
            break  # handling completed, skip other listeners
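A quick round trip through the registry sketched after Example #2, using the log helper sketched earlier; `on_save` and the "save" event are made up for illustration. A truthy return value makes `publish` stop notifying the remaining listeners:

def on_save(event_name, **kwargs):
    log("Saving " + kwargs.get("path", "?"))
    return True  # handled: publish() breaks out of its loop

subscribe("save", on_save)
publish("save", path="world.dat")
unsubscribe("save", on_save)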
Example #8
def main():
    log_init()
    log_info('Starting controller')

    try:
        ctrl = Controller(CONTROLLER_CONFIG, PH_CONFIG, PUMP_X_CONFIG,
                          PUMP_Y_CONFIG, SOLUTION_TANK_CONFIG,
                          SUPPLY_TANK_CONFIG)
        ctrl.run()

        log_err('Controller stopped running')
    except Exception as e:
        log_err(str(e))
        log_exception_trace()
Example #9
def initialize():
    global running_plugins
    for loader, name, isPkg in pkgutil.iter_modules(plugins.__path__):
        if name.startswith("_"):
            continue
        try:
            plugin = __import__("plugins." + name, globals(), locals(), ("plugins",))
        except Exception:
            log_err("Error importing plugin " + name)
            continue
        try:
            plugin.initialize()
        except Exception:
            log_err("Error initializing plugin " + name)
            continue
        running_plugins.add(plugin)
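The `__import__("plugins." + name, ...)` call works, but `importlib.import_module` is the more readable modern spelling. A sketch of the same loop, assuming the same `plugins` package layout:

import importlib
import pkgutil

import plugins

def initialize():
    global running_plugins
    for _, name, _ in pkgutil.iter_modules(plugins.__path__):
        if name.startswith("_"):
            continue
        try:
            plugin = importlib.import_module("plugins." + name)
        except Exception:
            log_err("Error importing plugin " + name)
            continue
        try:
            plugin.initialize()
        except Exception:
            log_err("Error initializing plugin " + name)
            continue
        running_plugins.add(plugin)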
Example #10
    def has_converged(self):
        # Guard against a missing or diverged estimate before computing delta;
        # otherwise abs(None) below would raise before this case is reached.
        if self.new_value is None or abs(self.new_value) == float('inf'):
            log_err('\tProblematic convergence!')
            return True

        if self.old_value is None:
            delta = abs(self.new_value)
        else:
            delta = abs(self.new_value - self.old_value)

        print('delta:', delta)
        if delta <= self.tolerance:
            return True

        if self.iter >= self.max_iter:
            return True
        return False
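`has_converged` reads like the stopping test of an iterative solver. A hypothetical driver, where the `old_value`, `new_value`, and `iter` fields come from the attributes used above and `step()` is an assumed update method:

def run_to_convergence(solver):
    # Hypothetical usage; step() is assumed to produce the next estimate.
    while not solver.has_converged():
        solver.old_value = solver.new_value
        solver.new_value = solver.step()
        solver.iter += 1
    return solver.new_value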
Example #11
    def start_receiving(self):
        try:
            plr_conn_pkg = players.PlayerConnectionPackage(
                env.Game.player_info)
            self.send(plr_conn_pkg)

            while self.listen:
                data = self.socket_to_server.recv(
                    Client.RECEIVE_MSG_BUFFER_SIZE).decode('utf-8')
                try:
                    json.loads(data)  # only validates the payload; raises ValueError if not JSON
                    utils.log("Got object from server '" + self.get_key() +
                              "':\n" + data)
                except ValueError:
                    utils.log("Got message from server '" + self.get_key() +
                              "':\n" + data)
        except socket.error:
            utils.log_err("Client message receiver thread stopped!")
Example #12
    def send(self, data):
        if isinstance(data, str):
            json_data = data
        else:
            json_data = json.dumps(data, indent=4, separators=(',', ': '))

        sent = 0
        try:
            sent = self.channel.send(json_data.encode('utf-8'))
        except socket.error:
            utils.log_err("Failed to send data!")

        if sent == 0:
            utils.log_wrn(
                "Unable to send data to client; server connection to '" +
                self.get_key() + "' broken! Auto-cleaning connection...")
            self.close()
        else:
            utils.log("Sent data '" + json_data + "' to '" + self.get_key() +
                      "'")
Example #13
    def start_receiving(self):
        utils.log("Starting server message receiver thread...")
        conn_key = "?"

        try:
            self.connection_socket = socket.socket(socket.AF_INET,
                                                   socket.SOCK_STREAM)
            self.connection_socket.bind(tuple(self.server))
            self.connection_socket.listen(5)
            utils.log("Ready for new incoming connections!")
            while self.open:
                c, addr = self.connection_socket.accept()
                conn_key = ":".join(str(i) for i in addr)
                utils.log("Receiving connection from " + conn_key + "...")
                self.connection_list[conn_key] = ServerConnection(
                    addr, c, None, self)
                self.connection_list[conn_key].start()
                utils.log("Connection to " + conn_key + " open!")
        except socket.error:
            utils.log_err("Server message receiver thread stopped!")

        utils.log("Connection to " + conn_key + " closed.")
Example #14
    def building_binary_to_decimal_map(self):
        log_err('\tStart generating binary vectors...')
        possible_features = get_binary_vector(self.encoded_observation_dim, self.feature_dim)

        log_err('\tStart mapping feature vectors to int encoding...')
        for possible_feature in possible_features:
            self.feature_symbol_mapping[str(possible_feature)] = len(self.feature_symbol_mapping)
        log_err('\tFinish mapping...')
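`get_binary_vector` is imported from `utils` in the scripts below and never shown. Given that callers pass `FEATURE_SPACE = 2**FEATURE_DIM` alongside `FEATURE_DIM`, one plausible implementation enumerates every binary vector of the given length:

def get_binary_vector(space, dim):
    # All `space` binary vectors of length `dim`, in counting order,
    # e.g. get_binary_vector(4, 2) -> [[0, 0], [0, 1], [1, 0], [1, 1]].
    return [[(i >> bit) & 1 for bit in range(dim - 1, -1, -1)]
            for i in range(space)]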
Example #15
import pickle
from utils import get_binary_vector, log_err
from ghmm import *

FEATURE_DIM = 20    # ???? get rid of one of the feature which is 
FEATURE_SPACE = 2**FEATURE_DIM

MAX_ITER = 10
MIN_IMPROVEMENT = 0.1

# 1. Read in the data from Mac
log_err('Reading data from pickle...')
with open('data.pkl', 'rb') as fp:
    data = pickle.load(fp)

# 2. Create binary-decimal mapper
log_err('Start generating binary vectors...')
feature_symbol_mapping = {}
possible_features = get_binary_vector(FEATURE_SPACE, FEATURE_DIM)
log_err('Start mapping feature vectors to int encoding...')
for possible_feature in possible_features:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
log_err('Finish mapping...')

# 3. Map the features from binary to decimal
log_err('Converting the features')
encoded_author_sequences = []
encoded_title_sequences = []
encoded_venue_sequences = []
Example #16
import numpy as np
from chmm import *
from utils import get_binary_vector, log_err
from training_set_generator import get_training_samples_BW

FEATURE_DIM = 20    # ???? get rid of one of the feature which is 
FEATURE_SPACE = 2**FEATURE_DIM

MAX_ITER = 10
MIN_IMPROVEMENT = 0.1

# 1. Read in the data
log_err('Reading data from retrieval...')
data = get_training_samples_BW('http://scholar.google.com/citations?user=YU-baPIAAAAJ&hl=en', True)

# 2. Create binary-decimal mapper
log_err('Start generating binary vectors...')
feature_symbol_mapping = {}
possible_features = get_binary_vector(FEATURE_SPACE, FEATURE_DIM)
log_err('Start mapping feature vectors to int encoding...')
for possible_feature in possible_features:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
log_err('Finish mapping...')

# 3. Map the features from binary to decimal
log_err('Converting the features')
encoded_author_sequences = []
encoded_title_sequences = []
encoded_venue_sequences = []

for sample in data['author_sequences']:
Example #17
    def train(self, observation_sequences, label_sequences):
        # Setup information
        self.observation_sequences = observation_sequences
        self.label_sequences = label_sequences
        self.training_sample_size = len(label_sequences)
        self.feature_dim = len(observation_sequences[0][0])  # feature dimension
        self.encoded_observation_dim = self.observation_dim**self.feature_dim

        log_err('\tStart generating binary vectors...')
        possible_features = get_binary_vector(self.encoded_observation_dim,
                                              self.feature_dim)

        log_err('\tStart mapping feature vectors to int encoding...')
        for possible_feature in possible_features:
            self.feature_symbol_mapping[str(possible_feature)] = len(
                self.feature_symbol_mapping)
        log_err('\tFinish mapping...')

        pi = np.array([0.0] * self.state_dim)
        transitions = np.array([[0.0] * self.state_dim] * self.state_dim)
        emissions = np.array([[0.0] * self.encoded_observation_dim] *
                             self.state_dim)

        pi_counter = np.array([0.0] * self.state_dim)
        transitions_counter = np.array([[0.0] * self.state_dim] *
                                       self.state_dim)
        emissions_counter = np.array([[0.0] * self.encoded_observation_dim] *
                                     self.state_dim)

        log_pi = np.array([0.0] * self.state_dim)
        log_transitions = np.array([[0.0] * self.state_dim] * self.state_dim)
        log_emissions = np.array([[0.0] * self.encoded_observation_dim] *
                                 self.state_dim)

        log_err('\tStart counting pi...')
        # 1. Counting first state in every label sequence to form pi
        for label_sequence in self.label_sequences:
            pi_counter[label_sequence[0]] += 1.0

        log_err('\tStart counting transitions...')
        # 2. Count all state transitions
        for label_sequence in self.label_sequences:
            for j in range(1, len(label_sequence)):
                transitions_counter[label_sequence[j - 1]][label_sequence[j]] += 1.0

        log_err('\tStart counting emissions...')
        # 3. Count emissions for each label
        for i in range(self.training_sample_size):
            for j in range(len(self.observation_sequences[i])):
                symbol = self.feature_symbol_mapping[str(
                    self.observation_sequences[i][j])]  # encode the feature vector into an int
                emissions_counter[self.label_sequences[i][j]][symbol] += 1.0

        log_err('\tStart forming log probability...')
        # 4. Form log probability, by using Laplace correction to avoid zero probabilities
        if self.useLaplaceRule:
            for i in range(self.state_dim):
                pi_counter[i] += 1.0

                for j in range(self.state_dim):
                    transitions_counter[i][j] += 1.0
                for k in range(self.encoded_observation_dim):
                    emissions_counter[i][k] += 1.0

        pi_count = sum(pi_counter)
        transition_count = [sum(transition) for transition in transitions_counter]  # ????
        emission_count = [sum(emission) for emission in emissions_counter]
        log_err('\tStart computing probability...')
        for i in range(len(pi_counter)):
            pi[i] = pi_counter[i] / pi_count
            log_pi[i] = math.log(pi_counter[i] / pi_count)
        for i in range(self.state_dim):
            for j in range(self.state_dim):
                transitions[i][j] = transitions_counter[i][j] / transition_count[i]
                log_transitions[i][j] = math.log(transitions_counter[i][j] /
                                                 transition_count[i])
        for i in range(self.state_dim):
            for j in range(self.encoded_observation_dim):
                emissions[i][j] = emissions_counter[i][j] / emission_count[i]
                log_emissions[i][j] = math.log(emissions_counter[i][j] /
                                               emission_count[i])

        self.pi = pi
        self.transitions = transitions
        self.emissions = emissions
        self.log_pi = log_pi
        self.log_transitions = log_transitions
        self.log_emissions = log_emissions
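The element-wise normalization loops at the end of `train` can be collapsed with numpy broadcasting. A sketch that should be equivalent, assuming the counter arrays built above:

import numpy as np

def normalize_counts(pi_counter, transitions_counter, emissions_counter):
    # Vectorized form of the probability computation in train().
    pi = pi_counter / pi_counter.sum()
    transitions = transitions_counter / transitions_counter.sum(axis=1, keepdims=True)
    emissions = emissions_counter / emissions_counter.sum(axis=1, keepdims=True)
    return pi, transitions, emissions, np.log(pi), np.log(transitions), np.log(emissions)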
Example #18
import numpy as np
import hmm_faster
from math import log
from utils import get_binary_vector, log_err
from training_set_generator import get_training_samples_BW

FEATURE_DIM = 20  # ???? get rid of one of the feature which is
FEATURE_SPACE = 2**FEATURE_DIM

MAX_ITER = 10
MIN_IMPROVEMENT = 0.1

# 1. Read in the data
log_err('Reading data from retrieval...')
# data = get_training_samples_BW('http://scholar.google.com/citations?user=YU-baPIAAAAJ&hl=en', True)
data = get_training_samples_BW(
    'http://scholar.google.com/citations?user=x3LTjz0AAAAJ&hl=en', True)

# 2. Create binary-decimal mapper
log_err('Start generating binary vectors...')
feature_symbol_mapping = {}
possible_features = get_binary_vector(FEATURE_SPACE, FEATURE_DIM)
log_err('Start mapping feature vectors to int encoding...')
for possible_feature in possible_features:
    feature_symbol_mapping[str(possible_feature)] = len(feature_symbol_mapping)
log_err('Finish mapping...')

# 3. Map the features from binary to decimal
log_err('Converting the features')
encoded_author_sequences = []
encoded_title_sequences = []
Example #19
          = np.array([z1_uv]), np.array([z2_uv]), np.array([mst_uv]), np.array([msterr_uv]),\
            np.array([phi_uv]), np.array([phierr_uv]), np.array([alp_uv]), np.array([alperr_uv]), np.array([ppr_n])
 else:
     lngth = len(mst_uv)
 #
 print('-------------------------------------------------------------')
 print('Working on: ' + ppr_n[0])
 print('-------------------------------------------------------------')
 #
 # Calculating SFRD
 #
 sfrd_uv = np.zeros(len(z1_uv))
 sfrd_uv_err = np.zeros(len(z1_uv))
 for j in range(len(z1_uv)):
     # Computing parameters array
     logphi, logphi_err = utl.log_err(phi_uv[j], phierr_uv[j])
     mean_all = np.array([mst_uv[j], logphi, alp_uv[j]])
     err_all = np.array([msterr_uv[j], logphi_err, alperr_uv[j]])
     zcen = (z1_uv[j] + z2_uv[j]) / 2
     #lst11 = utl.m_to_l_wave(mean_all[0], 1500)
     lt1 = 0.00001 / kap_uv
     sfr2, sfr2e = cov.sfrd_w_err(lum=lums_all,
                                  z=zcen,
                                  mean2=mean_all,
                                  err2=err_all,
                                  kappa=kap_uv,
                                  limit=lt1)
     sfrd_uv[j], sfrd_uv_err[j] = sfr2, sfr2e
     f22.write(ppr_n[0] + '\t' + str(z1_uv[j]) + '\t' + str(z2_uv[j]) +
               '\t' + str(sfr2) + '\t' + str(sfr2e) + '\n')
 #
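Unlike every other example on this page, `utl.log_err` in this snippet is not a logger: it takes a value and its uncertainty and returns two numbers (`logphi, logphi_err`). A plausible reading is standard error propagation into log10 space; a sketch under that assumption:

import numpy as np

def log_err(value, err):
    # d(log10 x)/dx = 1 / (x ln 10), so the uncertainty of log10(value)
    # propagates as err / (value * ln 10).
    return np.log10(value), err / (value * np.log(10.0))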
Example #20
    def run_with_boosting_features(self):
        i = 0
        self.new_labels = []
        self.combined_labels = []

        for raw_segment, label_sequence in zip(self.raw_segments, self.label_sequences):
            feature_vectors, new_labels = self.hmm_new.decode(raw_segment, True, True, self.token_BGM, self.pattern_BGM)
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            print i, ':  ', raw_segment

            # Combination step: 
            tmp_combined_labels = []    # the decided combined labels so far
            for token, old_label, new_label, feature_vector in zip(tokens, label_sequence, new_labels, feature_vectors):

                # Combine the old and new labels into a single combined label, then decide...
                combined_label = -1
                
                if old_label == new_label:
                    combined_label = new_label
                    tmp_combined_labels.append(new_label)
                
                # Combine compatible labels: FN and LN
                elif old_label in [0,1] and new_label in [0,1]:
                    combined_label = old_label
                    tmp_combined_labels.append(new_label)
                
                # Combine labels that are not compatible
                else:   
                    tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(feature_vector)    # Get the background knowledge provided by the feature vector: the language feature model
                    sorted_label_distribution = sorted(tmp_feature_entity.label_distribution.iteritems(), key=operator.itemgetter(1), reverse=True)
                    total_label_occurence = float(sum(tmp[1] for tmp in sorted_label_distribution))

                    

                    # ============================================================================================
                    # ============================================================================================
                    # ???? Experimenting: removing the low prob label distribution; FAILURE; ARCHIVED HERE AND DEPRECATED 
                    # sorted_label_distribution = []
                    # sum_prob = 0.0
                    # for pair in tmp_sorted_label_distribution:
                    #     sorted_label_distribution.append(pair)
                    #     sum_prob += pair[1]
                    #     if sum_prob/total_label_occurence >= 0.90:
                    #         break
                    # ============================================================================================
                    # ============================================================================================



                    # Dominant label case: Iterate from the highest label stats according to this feature vector:
                    for label_frequency in sorted_label_distribution:
                        if int(label_frequency[0]) in [old_label, new_label] and (label_frequency[1]/total_label_occurence)>=self.DOMINANT_RATIO:
                            print 'Dominant labels'
                            # Check for constraint:
                            tmp_label_to_check = int(label_frequency[0])
                            
                            # Find last occurence position of this label
                            if tmp_label_to_check not in [0,1]:
                                last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                            elif tmp_label_to_check in [0,1]:
                                last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                last_occurence = max(last_occurence_0, last_occurence_1)

                            # Checking constraints by simplifying what we did in viterbi
                            if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1):  # Never occurred, or last occurrence is the last label
                                # When we are deciding the first label
                                if len(tmp_combined_labels) == 0:
                                    first_bit = self.find_majority_structure()[0]
                                    if first_bit == 0 and tmp_label_to_check not in [0,1]:
                                        continue
                                    if first_bit == 3 and tmp_label_to_check != 3:
                                        continue

                                # VN CANNOT FOLLOW TI W/O DL constraint
                                if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                    continue
                            elif tmp_label_to_check in [0,1]:
                                flag = False
                                for j in range(last_occurence, len(tmp_combined_labels)):
                                    if tmp_combined_labels[j] not in [0,1,2]:
                                        flag = True
                                        break
                                if flag:
                                    continue
                            elif tmp_label_to_check == 3:
                                continue
                            elif tmp_label_to_check == 4:
                                if tmp_combined_labels[-1] == 3:    #????
                                    continue

                            combined_label = tmp_label_to_check
                            tmp_combined_labels.append(tmp_label_to_check)
                            break
                    
                    # No-dominance case OR dominance-failed-due-to-constraint case: check whether the label with the higher probability follows the publication-order constraint
                    if combined_label == -1:
                        # Iterate from the highest label stats according to this feature vector:

                        for label_frequency in sorted_label_distribution:
                            breakout_flag = False
                            #Test against constraints
                            # 1. DL separate labels principle
                            # 2. AU-TI-VN Order 
                            if int(label_frequency[0]) in [old_label, new_label]:
                                tmp_label_to_check = int(label_frequency[0])
                                
                                # find structure of the order, and find what have appeared, and so predict what to be appear next
                                structure_overview = []     #will record the order in big sense: 0,3,4/4,0,3
                                for tmp_combined_label in tmp_combined_labels:
                                    if tmp_combined_label in [2,5]:
                                        continue                                            
                                    elif tmp_combined_label in [0,1]:
                                        if 0 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(0)
                                    elif tmp_combined_label == 3:
                                        if 3 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(3)
                                    elif tmp_combined_label == 4:
                                        if 4 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(4)
                                # Based on the structure overview, find what should appear next
                                appear_next = []
                                if structure_overview == [0]:
                                    appear_next = [0,1,3,2,5]
                                elif structure_overview == [3]:
                                    appear_next = [3,0,1,2,5]
                                elif structure_overview == [0,3]:
                                    appear_next = [3,4,2,5]
                                elif structure_overview == [3,0]:
                                    appear_next = [0,1,4,2,5]
                                elif structure_overview == [0,3,4]:
                                    appear_next = [4,2,5]
                                elif structure_overview == [3,0,4]:
                                    appear_next = [4,2,5]
                                else:   #weird case
                                    print 'Weird structure! Weird case!'
                                    if tmp_feature_entity.label_distribution[str(old_label)] > tmp_feature_entity.label_distribution[str(new_label)]:
                                        tmp_label_to_check_list = [old_label, new_label]
                                    else:
                                        tmp_label_to_check_list = [new_label, old_label]
                                    # Apply constraints here too
                                    for tmp_label_to_check in tmp_label_to_check_list:
                                        if tmp_label_to_check not in [0,1]:
                                            last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                        elif tmp_label_to_check in [0,1]:
                                            last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                            last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                            last_occurence = max(last_occurence_0, last_occurence_1)

                                        # Checking constraints by simplifying what we did in viterbi
                                        if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1):
                                            # When we are deciding the first label
                                            if len(tmp_combined_labels) == 0:
                                                first_bit = self.find_majority_structure()[0]
                                                if first_bit == 0 and tmp_label_to_check not in [0,1]:
                                                    continue
                                                if first_bit == 3 and tmp_label_to_check != 3:
                                                    continue
                                            try:
                                                if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3:
                                                    continue
                                            except:
                                                continue
                                        elif tmp_label_to_check in [0,1]:
                                            flag = False
                                            for j in range(last_occurence, len(tmp_combined_labels)):
                                                if tmp_combined_labels[j] not in [0,1,2]:
                                                    flag = True
                                                    break
                                            if flag:
                                                continue
                                        elif tmp_label_to_check == 3:
                                            continue
                                        elif tmp_label_to_check == 4:
                                            if tmp_combined_labels[-1] == 3:
                                                continue

                                        combined_label = tmp_label_to_check
                                        tmp_combined_labels.append(combined_label)
                                        breakout_flag = True
                                        break

                                if breakout_flag:
                                    break
                                if tmp_label_to_check in appear_next:
                                    # Then check constraint. find last occurence, DL constraints
                                    # Just need to check DL constraints, no need to verify more on tokens, assume token verification is done in the first iteration
                                    if tmp_label_to_check not in [0,1]:
                                        last_occurence = ''.join([str(c) for c in tmp_combined_labels]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0,1]:
                                        last_occurence_0 = ''.join([str(c) for c in tmp_combined_labels]).rfind('0')
                                        last_occurence_1 = ''.join([str(c) for c in tmp_combined_labels]).rfind('1')
                                        last_occurence = max(last_occurence_0, last_occurence_1)

                                    # Checking constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (len(tmp_combined_labels)-1):
                                        if tmp_label_to_check == 4 and tmp_combined_labels[-1] == 3: #Hardcode rule [2013/07/23]: For VN, cannot directly follow a TI without DL???? may remove on real effect
                                            continue
                                    elif tmp_label_to_check in [0,1]:
                                        flag = False
                                        for j in range(last_occurence, len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [0,1,2]:
                                                flag = True
                                                break
                                        if flag:
                                            continue

                                    elif tmp_label_to_check == 3:
                                        continue
                                        # flag = False
                                        # for j in range(last_occurence, len(tmp_combined_labels)):
                                        #     if tmp_combined_labels[j] not in [3,2]:
                                        #         flag = True
                                        #         break
                                        # if flag:
                                        #     continue

                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:    #????
                                            continue

                                    # elif tmp_label_to_check == 2:
                                    # elif tmp_label_to_check == 5:
                                    
                                    # Otherwise, pass
                                    log_err('\t\t' + str(i) + 'Should combine this one')
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(tmp_label_to_check)
                                    # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                    break
                                    
                                else:
                                    continue

                        # Debug
                        if combined_label == -1:
                            log_err(str(i) + 'problem')
                            combined_label = (appear_next, sorted_label_distribution)
                            tmp_combined_labels.append(-1)


            # Final check the accordance with the major order, ideally, all records under one domain should have the same order... PS very ugly code I admit
            print '==========================tmp_combined_labels', tmp_combined_labels
            majority_order_structure = self.find_majority_structure()[1]
            majority_rate = self.find_majority_structure()[2]
            tmp_combined_labels_length = len(tmp_combined_labels)
            if majority_rate > 0.80 and majority_order_structure == [0,3,4]:
                # p1(phase1): author segments
                for p1 in range(tmp_combined_labels_length):
                    if tmp_combined_labels[p1] in [0,1,2,5]:
                        continue
                    else:
                        break

                # p2(phase2): title segments
                for p2 in range(p1, tmp_combined_labels_length):
                    if tmp_combined_labels[p2] == 3:
                        continue
                    else:
                        break

                #p3(phase3): venue segments
                for p3 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[p3] in [2,5,4]:
                        continue
                    else:
                        break

                # Decision
                if p1 == 0:
                    print 'Houston we got a SERIOUS problem!'
                    log_err('Houston we got a SERIOUS problem!!!!!!!!')

                if p2 == p1:
                    print 'Houston we got a problem!'
                    for sp2 in range(p2, tmp_combined_labels_length):
                        if tmp_combined_labels[sp2] != 2:
                            tmp_combined_labels[sp2] = 3
                        else:
                            break   # should fix common mislabeling at this point now??????????


            # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]:    # ???? not sure if this is normal
            #     # p1(phase1): title segments
            #     for p1 in range(tmp_combined_labels_length):
            #         if tmp_combined_labels[p1] in [3]:
            #             continue
            #         else:
            #             break

            #     # p2(phase2): author segments
            #     for p2 in range(p1, tmp_combined_labels_length):
            #         if tmp_combined_labels[p2] == 3:
            #             continue
            #         else:
            #             break

            #     #p3(phase3): venue segments
            #     for p3 in range(p2, tmp_combined_labels_length):
            #         if tmp_combined_labels[p3] in [2,5,4]:
            #             continue
            #         else:
            #             break

            #     # Decision
            #     if p1 == 0:
            #         print 'Houston we got a SERIOUS problem!'
            #         log_err('Houston we got a SERIOUS problem!!!!!!!!')

            #     if p2 == p1:
            #         print 'Houston we got a problem!'
            #         for sp2 in range(p2, tmp_combined_labels_length):
            #             if tmp_combined_labels[sp2] != 2:
            #                 tmp_combined_labels[sp2] = 3
            #             else:
            #                 break
            for old_label, new_label, tmp_combined_label, token, feature_vector in zip(label_sequence, new_labels, tmp_combined_labels, tokens, feature_vectors):
                print to_label(old_label), '\t', to_label(new_label), '\t', to_label(tmp_combined_label), '\t', token, '\t', feature_vector
            print '\n'
            i+=1
Example #21
    def start_receiving(self):
        try:
            while self.listen:
                try:
                    data = self.channel.recv(
                        Client.RECEIVE_MSG_BUFFER_SIZE).decode('utf-8')
                    try:
                        json_data = json.loads(data)

                        # check for valid player connection package
                        conn_dict = players.PlayerConnectionPackage.__dict__
                        is_valid_connection_pkg = True
                        for key in json_data:
                            if key not in conn_dict:
                                is_valid_connection_pkg = False
                        if is_valid_connection_pkg:
                            # if the received package is a valid connection package,
                            # read it and save its contents to the current player
                            # and set the player_is_ready flag to true!
                            if self.player_is_ready:
                                utils.log_wrn(
                                    "Received connection package from client '"
                                    + self.get_key() +
                                    "', but client already connected! Ignoring old package..."
                                )
                                self.player = players.Player(json_data)
                            else:
                                utils.log(
                                    "Received connection package from client '"
                                    + self.get_key() + "'!")
                                self.player = players.Player(json_data)
                            self.player_is_ready = True
                        else:
                            # check for valid player input package
                            input_dict = players.PlayerInputPackage.__dict__
                            is_valid_input_pkg = True
                            for key in json_data:
                                if key not in input_dict:
                                    is_valid_input_pkg = False
                            if is_valid_input_pkg:
                                utils.log_wrn("Debug - VALID INPUT PACKAGE!")
                            else:
                                utils.log_wrn(
                                    "Could not interpret client package...")
                                raise ValueError
                    except ValueError:
                        # ValueError here means the package received was not a JSON object,
                        # but rather a regular text message, used for events
                        utils.log("Got message from client '" +
                                  self.get_key() + "':\n" + data)

                except (ConnectionResetError, ConnectionAbortedError,
                        ConnectionRefusedError, ConnectionError):
                    self.close()
                    utils.log("Connection to client '" + self.get_key() +
                              "' has been closed.")

        except socket.error:
            utils.log_err("Message receiver thread stopped for client '" +
                          self.get_key() + "'!")
            self.close()
Example #22
def get_training_samples(url):
    log_err('\tGetting Training sample')
    raw_results = router(url)
    log_err('\tData retrieved. Preprocessing...')
    observation_list = []
    label_list = []
    records = []

    feature_generator = FeatureGenerator()
    token_generator = Tokens()

    for raw_result in raw_results:
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []

        authors = raw_result['authors']
        title = raw_result['title']        
        title_copy = raw_result['title']

        # Prefer the journal name if present, otherwise the conference name.
        venue = raw_result.get('conference name', '')
        venue = raw_result.get('journal name', venue)
        venue_copy = venue

        if len(venue) > 0:
            volume = raw_result.get('volume', '')
            issue = raw_result.get('issue', '')
            page = raw_result.get('page', '')

            venue += ' ' + volume + ' ' + issue + ' ' + page
            venue_copy += ' ' + volume + ' ' + issue + ' ' + page


        date = raw_result['publication date'][:4]

        # FN: 0
        # LN: 1
        # DL: 2
        # TI: 3
        # VN: 4
        # DT: 5

        # Author -> Title -> ...
        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)
            tmp_label_list += [1,2]
                
        # title
        title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)

        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
            tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        #=================================Variations of authors=================================
        # Changing order, inserting dots, and possibly inserting commas as delimiters inside names.
        # These variations are very sensitive to the sample source: Google Scholar
        # is the current source of samples, and on gscholar most names are in the
        # format JW Han.  <-- Prior knowledge
        # Read more Learn more Change the Globe !!!
        log_err('\tGenerating multiple cases for name variations... ')
        # ================================A. B
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        # authors
        for author in authors:
            if len(author) == 0:
                continue

            #???? BUG!!!! split() doesn't mean tokenization
            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author name into tokens
            if len(author_tokens) == 1:     # Cannot change order or anything, so leave this name alone, and pass to the next name
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author name into tokens
            if len(author_tokens) == 1:     # Cannot change order or anything, so leave this name alone, and pass to the next name
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================B, 
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        # ================================ "B A. ," variant: last name first, then dotted initial and comma
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================ "B A." variant: last name first, then dotted initial, no comma
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        # title += ' , '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            # venue += ' , '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))






        #============================================================================================================================================
        # Period Case!!!
        log_err('\tGenerating multiple cases for period as DL... ')
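        # Same permutations as above, but title and venue are rebuilt from the
        # *_copy originals with ' . ' (a period) as the trailing delimiter
        # instead of ' , '.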
        # Author -> Title -> ...
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
            tmp_label_list += [1,2]
                
        # title
        title = title_copy + ' . '
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            venue = venue_copy + ' . '
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author += ' , '
            tmp_record += author
            tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
            tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================ "A. B" variant: insert a dot after the first token
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        # authors
        for author in authors:
            if len(author) == 0:
                continue

            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author name into tokens
            if len(author_tokens) == 1:     # Single token: nothing to reorder, so keep this name as-is and move on to the next one
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']  # Split the author name into tokens
            if len(author_tokens) == 1:     # Single token: nothing to reorder, so keep this name as-is and move on to the next one
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Insert dot
                author = author_tokens[0] + '.' + author_tokens[1] + ' , '  # A. B
                tmp_token_length = token_generator.token_length(author)
                tmp_record += author
                tmp_label_list += [0]*(tmp_token_length-2) + [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================ "B ," variant: keep the last name only
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Only keep lastname
                author = author_tokens[1] + ' , '  # B
                tmp_record += author
                tmp_label_list += [1,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))



        # ================================ "B A. ," variant: last name first, then dotted initial and comma
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '.,'  # B A.,
                tmp_record += author
                tmp_label_list += [1,0,0,2]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

       
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # ================================ "B A." variant: last name first, then dotted initial, no comma
        # authors
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # title
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))


        # Title -> Author -> ...
        tmp_record = ''
        tmp_observation_list = []
        tmp_label_list = []
        tmp_record += title
        tmp_label_list += [3] * (feature_generator.token_length(title)-1)    #!!!!
        tmp_label_list += [2]

        # authors
        for author in authors:
            if len(author) == 0:
                continue
            author_tokens = token_generator.tokenize(author)['tokens']
            if len(author_tokens) == 1:     
                author += ' , '
                tmp_record += author
                tmp_label_list += [1,2]
            elif len(author_tokens) == 2:   # Change order and insert dot
                author = author_tokens[1] + ' ' + author_tokens[0] + '. '  # B A.
                tmp_record += author
                tmp_label_list += [1,0,0]
            else:                           # name contains more than two tokens, just leave it for now
                author += ' , '
                tmp_record += author
                tmp_label_list += [0] * (feature_generator.token_length(author)-2)    #!!!!
                tmp_label_list += [1,2]
                
        # venue
        if len(venue) > 0:
            tmp_record += venue
            tmp_label_list += [4] * (feature_generator.token_length(venue)-1)    #!!!!
            tmp_label_list += [2]

        # date
        if len(date) > 0:
            tmp_record += date
            tmp_label_list += [5] * feature_generator.token_length(date)    #!!!!

        
        # Aggregate and append
        label_list.append(tmp_label_list)
        records.append(tmp_record)
        observation_list.append(feature_generator.build(tmp_record))





    # ============================================================================= Verbose: print the training set
    for record, observation, label in zip(records, observation_list, label_list):
        for rr, oo, ll in zip(token_generator.tokenize(record)['tokens'], observation, label):
            if ll == 0:
                ll = 'FN'
            elif ll == 1:
                ll = 'LN'
            elif ll == 2:
                ll = 'DL'
            elif ll == 3:
                ll = 'TI'
            elif ll == 4:
                ll = 'VN'
            elif ll == 5:
                ll = 'DT'
            print oo, '\t', ll.encode('utf-8'), '\t', rr.encode('utf-8')
        print '\n\n'
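    # records, observation_list and label_list are parallel lists: entry i of
    # each describes the same synthetic record, token for token.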

    return observation_list, label_list
Exemple #31
0
    def run_with_boosting_features(self):
        i = 0
        self.new_labels = []
        self.combined_labels = []

        for raw_segment, label_sequence in zip(self.raw_segments,
                                               self.label_sequences):
            feature_vectors, new_labels = self.hmm_new.decode(
                raw_segment, True, True, self.token_BGM, self.pattern_BGM)
            self.new_labels.append(new_labels)
            tokens = Tokens(raw_segment).tokens
            print i, ':  ', raw_segment

            # Combination step:
            tmp_combined_labels = []  # the decided combined labels so far
            for token, old_label, new_label, feature_vector in zip(
                    tokens, label_sequence, new_labels, feature_vectors):

                # Combine the old and new labels into a single combined label, deciding case by case:
                combined_label = -1
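                # Decision procedure, sketched: (1) if the old and new labels
                # agree, keep them; (2) FN/LN are treated as compatible name
                # labels; (3) otherwise consult the background model for this
                # feature vector: first try a dominant label (frequency ratio
                # >= DOMINANT_RATIO), then fall back to the most frequent
                # label that satisfies the publication-order constraints.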

                if old_label == new_label:
                    combined_label = new_label
                    tmp_combined_labels.append(new_label)

                # Combine compatible labels: FN and LN can stand in for each other
                elif old_label in [0, 1] and new_label in [0, 1]:
                    combined_label = old_label
                    tmp_combined_labels.append(combined_label)

                # Combine labels that are not compatible
                else:
                    tmp_feature_entity = self.hmm_new.feature_entity_list.lookup(
                        feature_vector
                    )  # Look up the background knowledge for this feature vector (the language feature model)
                    sorted_label_distribution = sorted(
                        tmp_feature_entity.label_distribution.iteritems(),
                        key=operator.itemgetter(1),
                        reverse=True)
                    total_label_occurence = float(
                        sum(tmp[1] for tmp in sorted_label_distribution))

                    # ============================================================================================
                    # ============================================================================================
                    # ???? Experiment: drop low-probability entries from the label distribution; it failed and is archived here, deprecated
                    # sorted_label_distribution = []
                    # sum_prob = 0.0
                    # for pair in tmp_sorted_label_distribution:
                    #     sorted_label_distribution.append(pair)
                    #     sum_prob += pair[1]
                    #     if sum_prob/total_label_occurence >= 0.90:
                    #         break
                    # ============================================================================================
                    # ============================================================================================

                    # Dominant-label case: iterate over this feature vector's label statistics, most frequent first:
                    for label_frequency in sorted_label_distribution:
                        if int(label_frequency[0]) in [
                                old_label, new_label
                        ] and (label_frequency[1] /
                               total_label_occurence) >= self.DOMINANT_RATIO:
                            print 'Dominant labels'
                            # Check for constraint:
                            tmp_label_to_check = int(label_frequency[0])

                            # Find the last occurrence position of this label
                            if tmp_label_to_check not in [0, 1]:
                                last_occurence = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind(str(tmp_label_to_check))
                            elif tmp_label_to_check in [0, 1]:
                                last_occurence_0 = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind('0')
                                last_occurence_1 = ''.join([
                                    str(c) for c in tmp_combined_labels
                                ]).rfind('1')
                                last_occurence = max(last_occurence_0,
                                                     last_occurence_1)

                            # Checking constraints by simplifying what we did in viterbi
                            if last_occurence == -1 or last_occurence == (
                                    len(tmp_combined_labels) - 1
                            ):  # Never occurred, or the last occurrence is the final label
                                # When we are deciding the first label
                                if len(tmp_combined_labels) == 0:
                                    first_bit = self.find_majority_structure(
                                    )[0]
                                    if first_bit == 0 and tmp_label_to_check not in [
                                            0, 1
                                    ]:
                                        continue
                                    if first_bit == 3 and tmp_label_to_check != 3:
                                        continue

                                # VN CANNOT FOLLOW TI W/O DL constraint
                                if tmp_label_to_check == 4 and tmp_combined_labels[
                                        -1] == 3:
                                    continue
                            elif tmp_label_to_check in [0, 1]:
                                flag = False
                                for j in range(last_occurence,
                                               len(tmp_combined_labels)):
                                    if tmp_combined_labels[j] not in [0, 1, 2]:
                                        flag = True
                                        break
                                if flag:
                                    continue
                            elif tmp_label_to_check == 3:
                                continue
                            elif tmp_label_to_check == 4:
                                if tmp_combined_labels[-1] == 3:  #????
                                    continue

                            combined_label = tmp_label_to_check
                            tmp_combined_labels.append(tmp_label_to_check)
                            break

                    # No-dominance case, or dominance failed a constraint: fall back to the most probable label that satisfies the publication-order constraints
                    if combined_label == -1:
                        # Iterate over this feature vector's label statistics, most frequent first:

                        for label_frequency in sorted_label_distribution:
                            breakout_flag = False
                            # Test against the constraints:
                            # 1. DL separates labels
                            # 2. AU-TI-VN order
                            if int(label_frequency[0]) in [
                                    old_label, new_label
                            ]:
                                tmp_label_to_check = int(label_frequency[0])

                                # Determine which fields have appeared so far, then predict what may appear next
                                structure_overview = [
                                ]  # records the coarse field order, e.g. [0, 3, 4] or [3, 0, 4]
                                for tmp_combined_label in tmp_combined_labels:
                                    if tmp_combined_label in [2, 5]:
                                        continue
                                    elif tmp_combined_label in [0, 1]:
                                        if 0 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(0)
                                    elif tmp_combined_label == 3:
                                        if 3 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(3)
                                    elif tmp_combined_label == 4:
                                        if 4 in structure_overview:
                                            continue
                                        else:
                                            structure_overview.append(4)
                                # Based on the structure overview, find what should appear next
                                appear_next = []
                                if structure_overview == [0]:
                                    appear_next = [0, 1, 3, 2, 5]
                                elif structure_overview == [3]:
                                    appear_next = [3, 0, 1, 2, 5]
                                elif structure_overview == [0, 3]:
                                    appear_next = [3, 4, 2, 5]
                                elif structure_overview == [3, 0]:
                                    appear_next = [0, 1, 4, 2, 5]
                                elif structure_overview == [0, 3, 4]:
                                    appear_next = [4, 2, 5]
                                elif structure_overview == [3, 0, 4]:
                                    appear_next = [4, 2, 5]
                                else:  # weird case: the observed field order matches no expected structure
                                    print 'Weird structure! Weird case!'
                                    if tmp_feature_entity.label_distribution[str(
                                            old_label
                                    )] > tmp_feature_entity.label_distribution[
                                            str(new_label)]:
                                        tmp_label_to_check_list = [
                                            old_label, new_label
                                        ]
                                    else:
                                        tmp_label_to_check_list = [
                                            new_label, old_label
                                        ]
                                    # Apply constraints here too
                                    for tmp_label_to_check in tmp_label_to_check_list:
                                        if tmp_label_to_check not in [0, 1]:
                                            last_occurence = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind(str(tmp_label_to_check))
                                        elif tmp_label_to_check in [0, 1]:
                                            last_occurence_0 = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind('0')
                                            last_occurence_1 = ''.join([
                                                str(c)
                                                for c in tmp_combined_labels
                                            ]).rfind('1')
                                            last_occurence = max(
                                                last_occurence_0,
                                                last_occurence_1)

                                        # Checking constraints by simplifying what we did in viterbi
                                        if last_occurence == -1 or last_occurence == (
                                                len(tmp_combined_labels) - 1):
                                            # When we are deciding the first label
                                            if len(tmp_combined_labels) == 0:
                                                first_bit = self.find_majority_structure(
                                                )[0]
                                                if first_bit == 0 and tmp_label_to_check not in [
                                                        0, 1
                                                ]:
                                                    continue
                                                if first_bit == 3 and tmp_label_to_check != 3:
                                                    continue
                                            try:
                                                if tmp_label_to_check == 4 and tmp_combined_labels[
                                                        -1] == 3:
                                                    continue
                                            except:
                                                continue
                                        elif tmp_label_to_check in [0, 1]:
                                            flag = False
                                            for j in range(
                                                    last_occurence,
                                                    len(tmp_combined_labels)):
                                                if tmp_combined_labels[
                                                        j] not in [0, 1, 2]:
                                                    flag = True
                                                    break
                                            if flag:
                                                continue
                                        elif tmp_label_to_check == 3:
                                            continue
                                        elif tmp_label_to_check == 4:
                                            if tmp_combined_labels[-1] == 3:
                                                continue

                                        combined_label = tmp_label_to_check
                                        tmp_combined_labels.append(
                                            combined_label)
                                        breakout_flag = True
                                        break

                                if breakout_flag:
                                    break
                                if tmp_label_to_check in appear_next:
                                    # Then check constraints: find the last occurrence and apply the DL constraints
                                    # Only DL constraints need checking; token-level verification is assumed done in the first iteration
                                    if tmp_label_to_check not in [0, 1]:
                                        last_occurence = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind(str(tmp_label_to_check))
                                    elif tmp_label_to_check in [0, 1]:
                                        last_occurence_0 = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind('0')
                                        last_occurence_1 = ''.join([
                                            str(c) for c in tmp_combined_labels
                                        ]).rfind('1')
                                        last_occurence = max(
                                            last_occurence_0, last_occurence_1)

                                    # Checking constraints by simplifying what we did in viterbi
                                    if last_occurence == -1 or last_occurence == (
                                            len(tmp_combined_labels) - 1):
                                        if tmp_label_to_check == 4 and tmp_combined_labels[
                                                -1] == 3:  # Hard-coded rule [2013/07/23]: VN cannot directly follow TI without a DL; may be removed pending its real effect
                                            continue
                                    elif tmp_label_to_check in [0, 1]:
                                        flag = False
                                        for j in range(
                                                last_occurence,
                                                len(tmp_combined_labels)):
                                            if tmp_combined_labels[j] not in [
                                                    0, 1, 2
                                            ]:
                                                flag = True
                                                break
                                        if flag:
                                            continue

                                    elif tmp_label_to_check == 3:
                                        continue
                                        # flag = False
                                        # for j in range(last_occurence, len(tmp_combined_labels)):
                                        #     if tmp_combined_labels[j] not in [3,2]:
                                        #         flag = True
                                        #         break
                                        # if flag:
                                        #     continue

                                    elif tmp_label_to_check == 4:
                                        if tmp_combined_labels[-1] == 3:  #????
                                            continue

                                    # elif tmp_label_to_check == 2:
                                    # elif tmp_label_to_check == 5:

                                    # Otherwise, pass
                                    log_err('\t\t' + str(i) +
                                            ': Should combine this one')
                                    combined_label = tmp_label_to_check
                                    tmp_combined_labels.append(
                                        tmp_label_to_check)
                                    # combined_label = (tmp_label_to_check, sorted_label_distribution)
                                    break

                                else:
                                    continue

                        # Debug
                        if combined_label == -1:
                            log_err(str(i) + ': problem')
                            combined_label = (appear_next,
                                              sorted_label_distribution)
                            tmp_combined_labels.append(-1)

            # Final consistency check against the majority field order; ideally, all records in one domain share the same order
            print '==========================tmp_combined_labels', tmp_combined_labels
            majority_order_structure = self.find_majority_structure()[1]
            majority_rate = self.find_majority_structure()[2]
            tmp_combined_labels_length = len(tmp_combined_labels)
            if majority_rate > 0.80 and majority_order_structure == [0, 3, 4]:
                # p1(phase1): author segments
                for p1 in range(tmp_combined_labels_length):
                    if tmp_combined_labels[p1] in [0, 1, 2, 5]:
                        continue
                    else:
                        break

                # p2(phase2): title segments
                for p2 in range(p1, tmp_combined_labels_length):
                    if tmp_combined_labels[p2] == 3:
                        continue
                    else:
                        break

                #p3(phase3): venue segments
                for p3 in range(p2, tmp_combined_labels_length):
                    if tmp_combined_labels[p3] in [2, 5, 4]:
                        continue
                    else:
                        break

                # Decision
                if p1 == 0:
                    print 'Houston we got a SERIOUS problem!'
                    log_err('Houston we got a SERIOUS problem!!!!!!!!')

                if p2 == p1:
                    print 'Houston we got a problem!'
                    for sp2 in range(p2, tmp_combined_labels_length):
                        if tmp_combined_labels[sp2] != 2:
                            tmp_combined_labels[sp2] = 3
                        else:
                            break  # should fix the common mislabeling at this point

            # elif majority_rate > 0.80 and majority_order_structure == [3,0,4]:    # ???? not sure if this is normal
            #     # p1(phase1): title segments
            #     for p1 in range(tmp_combined_labels_length):
            #         if tmp_combined_labels[p1] in [3]:
            #             continue
            #         else:
            #             break

            #     # p2(phase2): author segments
            #     for p2 in range(p1, tmp_combined_labels_length):
            #         if tmp_combined_labels[p2] == 3:
            #             continue
            #         else:
            #             break

            #     #p3(phase3): venue segments
            #     for p3 in range(p2, tmp_combined_labels_length):
            #         if tmp_combined_labels[p3] in [2,5,4]:
            #             continue
            #         else:
            #             break

            #     # Decision
            #     if p1 == 0:
            #         print 'Houston we got a SERIOUS problem!'
            #         log_err('Houston we got a SERIOUS problem!!!!!!!!')

            #     if p2 == p1:
            #         print 'Houston we got a problem!'
            #         for sp2 in range(p2, tmp_combined_labels_length):
            #             if tmp_combined_labels[sp2] != 2:
            #                 tmp_combined_labels[sp2] = 3
            #             else:
            #                 break
            for old_label, new_label, tmp_combined_label, token, feature_vector in zip(
                    label_sequence, new_labels, tmp_combined_labels, tokens,
                    feature_vectors):
                print to_label(old_label), '\t', to_label(
                    new_label), '\t', to_label(
                        tmp_combined_label), '\t', token, '\t', feature_vector
            print '\n'
            i += 1
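        # Note: to_label (used in the dump above) is assumed to map the integer
        # codes to the tags from the previous example (0=FN, 1=LN, 2=DL, 3=TI,
        # 4=VN, 5=DT); its definition is not part of this snippet.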
Exemple #32
0
    def init(client=False):
        utils.log("Loading settings...")
        # check if settings file exists
        if os.path.isfile(Environment.SETTINGS_FILE_PATH):
            # exists - read it
            bad = False
            try:
                with open(Environment.SETTINGS_FILE_PATH) as settings_file:
                    data = json.load(settings_file)
                # check whether all setting keys exist in the opened file
                for settings_key in Game.settings:
                    if settings_key not in data:
                        # if a key is missing, log it and recreate the default settings file
                        utils.log_wrn("Missing settings key '" + settings_key +
                                      "'! Reverting to default settings...")
                        json_data = json.dumps(Game.settings,
                                               indent=4,
                                               separators=(',', ':'))
                        with open(Environment.SETTINGS_FILE_PATH,
                                  "w") as dump_file:
                            dump_file.write(json_data)
                        bad = True
                        break
            except ValueError:
                # json.load raises ValueError on malformed JSON
                utils.log_err("Failed to read settings file; bad JSON format")
                bad = True
            # set settings only if the file was valid and complete
            if not bad: Game.settings = data

        else:
            # does not exist - create it
            utils.log("No settings file, creating new file with defaults...")
            json_data = json.dumps(Game.settings,
                                   indent=4,
                                   separators=(',', ':'))
            try:
                with open(Environment.SETTINGS_FILE_PATH, "w") as dump_file:
                    dump_file.write(json_data)
            except IOError:
                utils.log_err("Failed to create default settings file!")

        utils.log("Loading bindings...")
        # check if bindings file exists
        if os.path.isfile(Environment.BINDINGS_FILE_PATH):
            # exists - read it
            bad = False
            try:
                with open(Environment.BINDINGS_FILE_PATH) as bindings_file:
                    data = json.load(bindings_file)
                # check whether all binding keys exist in the opened file
                for binding_key in Game.bindings:
                    if binding_key not in data:
                        # if a key is missing, log it and recreate the default bindings file
                        utils.log_wrn("Missing binding key '" + binding_key +
                                      "'! Reverting to default settings...")
                        json_data = json.dumps(Game.bindings,
                                               indent=4,
                                               separators=(',', ':'))
                        with open(Environment.BINDINGS_FILE_PATH,
                                  "w") as dump_file:
                            dump_file.write(json_data)
                        bad = True
                        break
            except ValueError:
                # json.load raises ValueError on malformed JSON
                utils.log_err("Failed to read bindings file; bad JSON format")
                bad = True
            # set bindings only if the file was valid and complete
            if not bad: Game.bindings = data

        else:
            # does not exist - create it
            utils.log("No bindings file, creating new file with defaults...")
            json_data = json.dumps(Game.bindings,
                                   indent=4,
                                   separators=(',', ':'))
            try:
                with open(Environment.BINDINGS_FILE_PATH, "w") as dump_file:
                    dump_file.write(json_data)
            except IOError:
                utils.log_err("Failed to create default bindings file!")

        # if the client flag is set, do not initialize pygame
        if not client:
            utils.log("Loading player info...")
            # check if player info file exists
            if os.path.isfile(Environment.PLAYER_INFO_FILE_PATH):
                # exists - read it
                bad = False
                try:
                    with open(Environment.PLAYER_INFO_FILE_PATH
                              ) as player_info_file:
                        data = json.load(player_info_file)
                    # check whether all player info keys exist in the opened file
                    for player_info_key in Game.player_info:
                        if player_info_key not in data:
                            # if a key is missing, log it and recreate the default player info file
                            utils.log_wrn(
                                "Missing player info key '" + player_info_key +
                                "'! Reverting to default settings...")
                            json_data = json.dumps(Game.player_info,
                                                   indent=4,
                                                   separators=(',', ':'))
                            with open(Environment.PLAYER_INFO_FILE_PATH,
                                      "w") as dump_file:
                                dump_file.write(json_data)
                            bad = True
                            break
                except ValueError:
                    # json.load raises ValueError on malformed JSON
                    utils.log_err(
                        "Failed to read player info file; bad JSON format")
                    bad = True
                # set player info only if the file was valid and complete
                if not bad: Game.player_info = data

            else:
                # does not exist - create it
                utils.log(
                    "No player info file, creating new file with defaults...")
                json_data = json.dumps(Game.player_info,
                                       indent=4,
                                       separators=(',', ':'))
                try:
                    with open(Environment.PLAYER_INFO_FILE_PATH,
                              "w") as dump_file:
                        dump_file.write(json_data)
                except IOError:
                    utils.log_err("Failed to create default player info file!")

            # initialize pygame
            utils.log("Initializing PyGame library...")
            pygame.mixer.pre_init(44100, 16, 2, 4096)
            pygame.init()
            Game.screen = pygame.display.set_mode(Game.settings["screen_size"])
            pygame.display.set_caption(Game.settings["title"])
            pygame.display.set_icon(pygame.image.load(Game.settings["icon"]))
            utils.log("PyGame library initialized!")