Example #1
    def __init__(self, fs, name, addr, opts):
        self.addr = addr
        self.jt_addr = opts["jt_addr"]
        self.jt = ServerProxy(self.jt_addr)
        self.hb_timeout = 0.2  # heartbeat timeout in seconds
        self.on = True
        self.mapper = Mapper(opts, fs, "map" + name, addr)
        self.reducer = Reducer(fs, "reduce" + name, addr, opts,
                               RPCMapperClient())
Example #2
    def createReducer(self):
        reducer = Reducer(5003 + (self.num - 1) * 10, self.ip)
        reducer.logging(False)
        reducer.log('Starting Up')

        # execution code goes here
        reducer.listen()

        # exiting
        reducer.log('Exiting')
Example #3
def main():
    initialize_log()
    logging.info("Starting reducer.")
    config_params = parse_config_params()
    reducer = Reducer(config_params['aggregated_data_queue'],
                      config_params['sink_queue'],
                      config_params['aggregators_quantity'],
                      config_params['unflatten_key'],
                      config_params['unflatten_value_key'])
    reducer.start()
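
The parse_config_params() helper is not shown in this example; here is a minimal sketch of one possible implementation that reads the same keys from environment variables (the variable names and defaults are assumptions, not taken from the original project):

import os


def parse_config_params():
    # Hypothetical sketch: pull the reducer settings used above from environment variables.
    return {
        'aggregated_data_queue': os.environ.get('AGGREGATED_DATA_QUEUE', 'aggregated_data'),
        'sink_queue': os.environ.get('SINK_QUEUE', 'sink'),
        'aggregators_quantity': int(os.environ.get('AGGREGATORS_QUANTITY', '1')),
        'unflatten_key': os.environ.get('UNFLATTEN_KEY', 'key'),
        'unflatten_value_key': os.environ.get('UNFLATTEN_VALUE_KEY', 'value'),
    }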
Example #4
    def _parse(toklist):
        """ Parse a token list as a query """

        # Parse with the nonterminal 'QueryRoot' as the grammar root
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:

            sent_begin = 0
            num_sent = 0
            num_parsed_sent = 0
            rdc = Reducer(bp.grammar)
            trees = dict()
            sent = []

            for ix, t in enumerate(toklist):
                if t[0] == TOK.S_BEGIN:
                    sent = []
                    sent_begin = ix
                elif t[0] == TOK.S_END:
                    slen = len(sent)
                    if not slen:
                        continue
                    num_sent += 1
                    # Parse the accumulated sentence
                    num = 0
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest is not None:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                    except ParseError as e:
                        forest = None
                    if num > 0:
                        num_parsed_sent += 1
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(forest)
                        #ParseForestPrinter.print_forest(forest)

                elif t[0] == TOK.P_BEGIN:
                    pass
                elif t[0] == TOK.P_END:
                    pass
                else:
                    sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees
Example #5
class Worker:
    def __init__(self, fs, name, addr, opts):
        self.addr = addr
        self.jt_addr = opts["jt_addr"]
        self.jt = ServerProxy(self.jt_addr)
        self.hb_timeout = 0.2  # heartbeat timeout in seconds
        self.on = True
        self.mapper = Mapper(opts, fs, "map" + name, addr)
        self.reducer = Reducer(fs, "reduce" + name, addr, opts,
                               RPCMapperClient())

    def start(self):
        print('Init worker')
        print('Start sending heartbeats to', self.jt_addr)
        _thread.start_new_thread(self._heartbeat, ())
        print('Server is ready')

    def _heartbeat(self):
        while self.on:
            try:
                self.jt.heartbeat(self.addr)
            except Exception as e:
                print(e)
            time.sleep(self.hb_timeout)

    # map data by applying the supplied map function
    # task_id - unique task id
    # rds_count - number of reducers for the task
    # chunk_path - DFS path to the chunk file to map
    # map_script - DFS path to the map function script
    # restart_task - if True, restart the map task even if it is already completed or currently executing
    def map(self,
            task_id,
            rds_count,
            chunk_path,
            map_script,
            restart_task=False):
        return self.mapper.map(task_id, rds_count, chunk_path, map_script,
                               restart_task)

    # get status of task execution for the current task
    def get_status(self, task_id, chunk_path):
        return self.mapper.get_status(task_id, chunk_path)

    # read mapped data for a specific region
    # task_id - unique task id
    # region_number - integer region assigned to the requesting reducer
    # Returns a dict {status: Status.ok, data: list of tuples}
    # if the file does not exist, status = Status.not_found
    # if the file is empty, returns ok and an empty list
    def read_mapped_data(self, task_id, region_number):
        return self.mapper.read_mapped_data(task_id, region_number)

    # signal from the JT to start reducing
    # task_id - unique task id
    # region - region the reducer is responsible for
    # mappers - mappers that contain data for the current task
    # script_path - DFS path to the reduce script
    def reduce(self, task_id, region, mappers, script_path):
        return self.reducer.reduce(task_id, region, mappers, script_path)
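
The heartbeat loop above calls heartbeat(addr) on an XML-RPC ServerProxy pointing at opts["jt_addr"]; below is a minimal sketch of an endpoint that could receive those calls (the port and the bookkeeping are assumptions, not the project's actual job tracker):

import time
from xmlrpc.server import SimpleXMLRPCServer

last_seen = {}  # worker address -> timestamp of the last heartbeat received


def heartbeat(addr):
    last_seen[addr] = time.time()
    return True


if __name__ == '__main__':
    server = SimpleXMLRPCServer(('localhost', 9000), allow_none=True, logRequests=False)
    server.register_function(heartbeat, 'heartbeat')
    server.serve_forever()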
Example #6
def main():
    """
    This main() implements the following process and print out the number of shared tokens from two txt files

    @ outputs sorted tokenized results from txt1
    @ outputs sorted tokenized results from txt2

    @ merges these two sorted results and print out the number of shared tokens
    @ outputs the merged results
    """
    assert not os.path.exists(
        sys.argv[3]), "ERROR: merged output file already exists!"

    temp1, temp2 = ".f1_temp", ".f2_temp"

    t1, t2 = open(temp1, 'w'), open(temp2, 'w')

    f1, f2 = open(sys.argv[1], 'r'), open(sys.argv[2], 'r')

    wc1, wc2 = WordCount(f1), WordCount(f2)

    wc1.output_to(t1)
    wc2.output_to(t2)

    t1.close()
    t2.close()

    f1.close()
    f2.close()

    t1, t2 = open(temp1, 'r'), open(temp2, 'r')

    ofile = open(sys.argv[3], 'a')

    r = Reducer(t1, t2)

    num_of_same_token = r.merge_to(ofile, count_same_token=True)

    print "# of common tokens: ", num_of_same_token

    t1.close()
    t2.close()
    ofile.close()

    subprocess.call(["rm", ".f1_temp", ".f2_temp"])

    return 0
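
Reducer.merge_to above merges two sorted token streams and counts the tokens common to both; here is a standalone sketch of that merge step, inferred from the usage shown (the real Reducer interface may differ):

def merge_sorted_token_files(t1, t2, ofile, count_same_token=True):
    # Hypothetical two-pointer merge of two sorted, line-oriented token files,
    # counting lines that appear in both streams.
    a, b = t1.readline(), t2.readline()
    shared = 0
    while a and b:
        if a == b:
            shared += 1
            ofile.write(a)
            a, b = t1.readline(), t2.readline()
        elif a < b:
            ofile.write(a)
            a = t1.readline()
        else:
            ofile.write(b)
            b = t2.readline()
    while a:
        ofile.write(a)
        a = t1.readline()
    while b:
        ofile.write(b)
        b = t2.readline()
    return shared if count_same_token else None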
Example #7
    def run(self, Mapper, Reducer, data):
        #map
        mapper = Mapper()
        tuples = mapper.map(data)

        #combine
        combined = {}
        for k, v in tuples:
            if k not in combined:
                combined[k] = []
            combined[k].append(v)

        #reduce
        reducer = Reducer()
        output = reducer.reduce(combined)

        for line in output:
            print(line)
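
A minimal word-count Mapper/Reducer pair that would plug into the run() harness above: map() returns (key, value) tuples, and reduce() receives the combined {key: [values]} dict, as the harness implies.

class WordCountMapper:
    def map(self, data):
        # emit a (word, 1) tuple for every word in the input text
        return [(word, 1) for word in data.split()]


class WordCountReducer:
    def reduce(self, combined):
        # combined is {key: [values]}; emit one "word count" line per key
        return ['{} {}'.format(word, sum(counts))
                for word, counts in sorted(combined.items())]

# e.g. runner.run(WordCountMapper, WordCountReducer, "to be or not to be")
# prints: "be 2", "not 1", "or 1", "to 2"  (runner being the object that defines run())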
Example #8
    def __init__(self):
        """ Create singleton instance """
        # Check whether we already have an instance
        if ReductionSingleton.__instance is None:
            # Create and remember instance
            ReductionSingleton.__instance = Reducer()

        # Store instance reference as the only member in the handle
        self.__dict__[
            '_ReductionSingleton__instance'] = ReductionSingleton.__instance
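
For context, this handle pattern normally also delegates attribute access to the shared instance; the conventional delegation methods are sketched below (an assumption about the rest of the class, not code from the original source):

    def __getattr__(self, attr):
        """ Delegate attribute reads to the shared Reducer instance """
        return getattr(self.__instance, attr)

    def __setattr__(self, attr, value):
        """ Delegate attribute writes to the shared Reducer instance """
        return setattr(self.__instance, attr, value)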
Example #9
    def __init__(self, parser, toklist, verbose = False):
        self._parser = parser
        self._reducer = Reducer(parser.grammar)
        self._num_sent = 0
        self._num_parsed_sent = 0
        self._num_tokens = 0
        self._total_ambig = 0.0
        self._total_tokens = 0
        self._start_time = time.time()
        self._verbose = verbose
        self._toklist = toklist
Example #10
def main():
    if not os.path.exists('.data/items.json'):
        raise Exception(
            'The items.json file in the .data folder does not exist. Please run the scraping script before executing this script.'
        )

    collection_name = 'rsbuddy'
    client = MongoClient('localhost', 27017)
    database = client[collection_name]

    items = list(read_json('.data/items.json').items())
    indexes = np.array_split(np.arange(len(items)), len(items) // 50)
    threads = []
    thread_count = 2

    for index in range(len(indexes)):
        indexes_ = indexes[index]

        thread = Reducer(database, collection_name, indexes_, items)
        thread.start()
        threads.append(thread)

        if (index % thread_count == 0 and index != 0):
            for thread in threads:
                thread.join()

            threads = []
Example #11
    with open(input_file) as reader:
        all_data = json.loads(reader.read())
        data = all_data['records']
    
    # mapped to country

    mapper = Mapper()
    if len(sys.argv) > 2:
        query_continent = sys.argv[2]
        country_data = mapper.map_continent(data, query_continent)
    else:
        country_data = mapper.map_continent(data, "all")
    
    # get stats of each country
    reducer = Reducer()
    country_stats = reducer.get_country_stats(country_data)
    country_deaths_counts, country_cases_counts, country_mortality_counts, country_infection_counts = reducer.slice_country_stats(country_stats)

    # sort the data
    final_result = {}
    sorted_deaths = reducer.sort(country_deaths_counts, 10)
    final_result['Top_Deaths'] = sorted_deaths
    sorted_cases = reducer.sort(country_cases_counts,10)
    final_result['Top_Cases'] = sorted_cases
    sorted_infection = reducer.sort(country_infection_counts,10)
    final_result['Top_Infection'] = sorted_infection
    sorted_mortality = reducer.sort(country_mortality_counts,10)
    final_result['Top_Mortality'] = sorted_mortality

    output_json = json.dumps(final_result, indent=4)
Example #12
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                #print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    #print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
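
The 'ambiguity factor' num ** (1 / slen) above is the per-token geometric mean of the number of tree combinations, and the running totals form a sentence-length-weighted average of it; a small worked sketch with made-up numbers:

# sentence 1: 8 tokens, 256 combinations -> 256 ** (1/8) = 2.0 parses per token
# sentence 2: 4 tokens,  16 combinations ->  16 ** (1/4) = 2.0 parses per token
sentences = [(8, 256), (4, 16)]  # (slen, num) pairs, purely illustrative
total_ambig = sum(num ** (1 / slen) * slen for slen, num in sentences)
total_tokens = sum(slen for slen, _ in sentences)
avg_ambig_factor = total_ambig / total_tokens  # == 2.0 for these numbers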
Example #13
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0 # Number of tree combinations in forest
                score = 0 # Reducer score of the best parse tree

                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)

                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)

                except ParseError as e:
                    forest = None
                    # Obtain the index of the offending token
                    err_index = e.token_index

                if num > 0:
                    num_parsed_sent += 1

                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)

                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                print("Warning: mismatch between MIM token '{0}' and Reynir token '{1}'".format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Reynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 3
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Reynir token ahead
                    print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
Example #14
    def execute(self, map_func, reduce_func, kill_idx=-1):
        '''
        Executes the Master worker to complete the MapReduce task
        Args:
            1. map_func - handle for the UDF map function
            2. reduce_func - handle for the UDF reduce function
            3. kill_idx - specifies the worker to be killed; used to simulate fault tolerance when >= 0
        '''

        # Logic for coordinating mappers and reducer
        self.mappers = []
        self.reducers = []
        self.active_reducers = []

        #instantiate mappers
        for idx in range(len(self.input_file_paths)):
            self.mappers.append(
                Mapper(idx, self.R, self.input_file_paths[idx],
                       f'{self.TMP_DIR}/intermediate', map_func))

        # NOTE: Keeping this for future execution time comparison
        # for m in mappers:
        # 	m.execute_map()
        # 	while (m.status != 'DONE'):
        # 		continue
        # 	self.active_reducers = self.active_reducers | m.reducer_ids
        # 	print('MAPPER {} finished executing'.format(m.id+1)) #, m.id, m.status)

        print("Map phase:")
        self.phase_flag = 0
        #instantiate processes for map phase
        self.processes = [None] * self.M
        self.reducer_ids = [None] * self.M
        self.ps, self.cs = [None] * self.M, [None] * self.M
        self.mapper_status = [True] * self.M
        self.attempts = [0] * self.M

        for i, m in enumerate(self.mappers):

            #queue used for message passing
            self.reducer_ids[i] = mp.Queue()
            # ps[i], cs[i] = mp.Pipe()
            self.cs[i] = mp.Queue()
            self.processes[i] = mp.Process(target=m.execute_map,
                                           args=(self.reducer_ids[i],
                                                 self.cs[i]))
            #execute mapper
            self.processes[i].start()
            #simulate process crash to test fault tolerance
            if (kill_idx == i):
                print(f"Killing process {i}")
                self.processes[i].kill()

        # Code for testing fault tolerance timeout
        if (kill_idx == -2):
            print(f"Killing process 1")
            self.processes[1].kill()

        #wait until all mappers have finished
        #mapping_status: Checks if phase is complete
        mapping_status = False
        while (mapping_status == False):
            mapping_status = True
            for i, m in enumerate(self.mappers):
                curr_status = None
                while True:
                    try:
                        #heartbeat message
                        [curr_status,
                         timestamp] = self.cs[i].get(timeout=self.timeout)
                        break
                    except:
                        #no message received, check if max attempts reached
                        if (self.attempts[i] < self.max_attempts):
                            # restart replacement worker, increment attempt count
                            self.restart_process(i, self.M, kill_idx)
                            self.attempts[i] += 1
                        else:
                            for i, m in enumerate(self.mappers):
                                self.processes[i].kill()
                            raise ValueError(
                                "RETRY_ERROR: Maximum attempts reached, job failed"
                            )

                #check status received
                if curr_status == 'DONE' and self.mapper_status[i] == True:
                    self.mapper_status[i] = False
                    #get all valid reducer_ids
                    self.active_reducers += self.reducer_ids[i].get()
                    #wait until all processes have been completed
                    self.processes[i].join()
                else:
                    mapping_status = False

        print("\nAll mappers have finished executing")
        print("\nReduce phase:")
        self.phase_flag = 1
        # NOTE: Keeping this for future execution time comparison
        # for r in reducer:
        # 	r.execute_reduce()
        # 	while (r.status != 'DONE'):
        # 		continue
        # 	print('REDUCER {} finished executing'.format(r.id+1))#, r.id, r.status)

        #similar to map phase, instantiate all reducers and processes
        self.active_reducers = (list(set(self.active_reducers)))
        self.processes = [None] * self.R
        self.ps, self.cs = [None] * self.R, [None] * self.R
        self.reducer_status = [True] * len(self.active_reducers)

        for idx in (self.active_reducers):
            self.reducers.append(
                Reducer(idx, len(self.input_file_paths),
                        f'{self.TMP_DIR}/intermediate', self.OUT_DIR,
                        reduce_func))

        #setting up processes for reducers
        for i, r in enumerate(self.reducers):
            self.cs[i] = mp.Queue()
            self.processes[i] = mp.Process(target=r.execute_reduce,
                                           args=(self.cs[i], ))
            self.processes[i].start()
            #killing certain workers to test fault tolerance
            if (kill_idx == i):
                print(f"Killing process {i+1}")
                self.processes[i].kill()

        #check for heartbeat messages, similar to map phase
        reducing_status = False
        while reducing_status == False:
            reducing_status = True
            for i, r in enumerate(self.reducers):
                curr_status = None
                while True:
                    try:
                        #print(self.reducer_status[i])
                        if (self.reducer_status[i] is True):
                            [curr_status,
                             timestamp] = self.cs[i].get(timeout=self.timeout)
                        break
                    except:
                        if (self.attempts[i] < self.max_attempts):
                            self.restart_process(i, self.R, kill_idx)
                            self.attempts[i] += 1
                        else:
                            print("Max attempts reached, task not completed")
                            for i, m in enumerate(self.reducers):
                                self.processes[i].kill()
                            raise ValueError(
                                "TIMEOUT ERROR: Max attempts reached, task not completed"
                            )

                if curr_status == 'DONE' and self.reducer_status[i] == True:
                    self.reducer_status[i] = False
                    self.processes[i].join()
                elif curr_status == 'RUNNING':
                    reducing_status = False

        print("\nAll reducing tasks have been completed")
Example #15
    def clean(cls, reducer_cls=None):
        if reducer_cls is None:
            ReductionSingleton.__instance = Reducer()
        else:
            ReductionSingleton.__instance = reducer_cls()
Example #16
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print(
                        "Parsed sentence of length {0} with {1} combinations{2}"
                        .format(
                            slen, num, "\n" +
                            (" ".join(s[1]
                                      for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num**(1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(
                        num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(tokens=toklist,
                  tok_num=len(toklist),
                  num_sent=num_sent,
                  num_parsed_sent=num_parsed_sent,
                  avg_ambig_factor=(total_ambig /
                                    total_tokens) if total_tokens > 0 else 1.0)

    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)
Example #17
def main():
    nodeId = sys.argv[1]
    ip = sys.argv[2]

    #instantiate the reducer
    reduce = Reducer(nodeId, ip, 5004)
Example #18
###################
# Global grouping #
###################
print '------------------'
print('Global grouping...')
print '------------------'
listOfDirectory = []
globalGrouperDirectory = '/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/'
for i in range(0,totalNumberOfGrouper):
    listOfDirectory.append('/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/')
globalDict = Grouper.globalGrouper(saveStateNameGrouper,listGrouperNum,listLastCallNum,listOfDirectory,globalGrouperDirectory)
print('Global grouping done.')

############
# Reducing #
############
print '------------'
print('Reducing ...')
print '------------'
outputDict = dict()
for key, globalNodeFileName in globalDict.iteritems():
    reduceIterator = ReduceFromGroupIterator(globalNodeFileName)
    theReduceContext = ReduceContext(key,reduceIterator)
    outputDict[key] = Reducer.reduce(theReduceContext)
print('Reducing done.')

##########
# OUTPUT #
##########
print '\n------------------------------\nOutput\n------------------------------\n'
for key in outputDict :
    print str(key) + ' - ' + str(outputDict[key])
# print 'apta : ' + str(outputDict['apta']) + ' vs 7'
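
Reducer.reduce above receives a ReduceContext built from a key and an iterator over the grouped values; a word-count-style sketch of a compatible reducer follows (the context interface is an assumption inferred from how it is constructed above):

class Reducer(object):
    @staticmethod
    def reduce(context):
        # Hypothetical reducer: sum the grouped values for the context's key.
        total = 0
        for value in context:  # assumption: the ReduceContext is iterable over its values
            total += int(value)
        return total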
Example #19
class Gui:
    def __init__(self, rule_list, random_forest):
        self.random_forest = random_forest
        self.reducer = Reducer(rule_list, self.random_forest)

        # self.red_ruleset = rule_list
        # self.new_ruleset = rule_list

    @staticmethod
    def print_rule(rule, feature_names):
        ret = 'if '
        for i in range(0, len(rule) - 1):
            if rule[i][1] == 'l':
                lower_greater = '<='
            else:
                lower_greater = '>'
            ret += feature_names[rule[i][0]] + " " + lower_greater + " " + str(
                rule[i][2])
            if i < len(rule) - 2:
                ret += ' and '
            else:
                ret += ' then '

        if rule[len(rule) - 1][0] > rule[len(rule) - 1][1]:
            ret += '\nHEALTHY!'
        else:
            ret += '\nDISEASED!'

        return ret

    @staticmethod
    def print_all_rules(ruleset, feature_names):
        ret = ''
        for rule in ruleset:
            ret += Gui.print_rule(rule, feature_names)
            ret += ' \n '

        return ret

    # implementation of the GUI
    # feature_names: list of strings with all feature names
    # ruleset: array of array of arrays with all rules
    # X_train: dataframe with all data samples of training set
    # y_train: ground truth of X_train as array
    # X_test: dataframe with all data samples of test set
    # y_test: ground truth of X_test as array
    def window(self, feature_names, data_set_name, ruleset, X_train, y_train,
               X_test, y_test):
        global red_ruleset
        global new_ruleset

        red_ruleset = ruleset
        new_ruleset = ruleset

        feature_info = "Please name your favourite features. Rules containing them will be less likely to be deleted " \
                       "from the rule set. You can name as many as you want. The order matters: the first feature is " \
                       "treated as the most preferred one. Please separate the features with a comma. An example would " \
                       "be: \n \t 1,2,3"
        perc_info = "Pleas name the percentage of the size of the original rule set, you would like the reduced rule set " \
                    "to have. Please only type in the number, without the percent sign. An example would be: \n \t 30"

        # eliminate useless queries within a rule
        def first_reduction():
            global red_ruleset
            global new_ruleset

            red_ruleset = self.reducer.reduce_rules()
            red1_label.config(text="new rule size: " + str(len(red_ruleset)))

        # reduce the rule set based on given percentage and preferred features
        def reduce_action():
            global new_ruleset
            global red_ruleset

            features = eingabefeld.get()
            percentage = entrytext.get()

            if features == "":
                features = []
            else:
                features = helpers.string_to_int_list(features)

            if percentage == "":
                reduce_label.config(text="no percentage set")
            else:
                numtoelim = int(
                    (1 - (int(percentage) / 100)) * len(red_ruleset))
                new_ruleset = self.reducer.eliminate_weakest_rules_2(
                    favourite_features=features,
                    k=4,
                    numtoelim=numtoelim,
                    ruleset=red_ruleset,
                    xtrain=X_train,
                    ytrain=y_train)
                vector_pred = self.random_forest.apply_ruleset_get_vector_new(
                    ruleset=new_ruleset, xtest=X_test)

                if DEBUG:
                    print("gui: vector pred len: %s" % len(vector_pred))
                    print("gui: y_test len: %s" % len(y_test))

                acc = self.random_forest.get_accuracy_of_ruleset_new(
                    ruleset=new_ruleset, xtest=X_test, ytest=y_test)

                spec = helpers.get_specificity(reslist=vector_pred,
                                               truevals=y_test)
                if DEBUG:
                    print("gui: spec: %s" % spec)
                sens = helpers.get_sensitivity(reslist=vector_pred,
                                               truevals=y_test)

                reduce_label.config(text="New Rule Size:  " +
                                    str(len(new_ruleset)))
                acc_label.config(text="Accuracy: " + str(acc) +
                                 ", Sensitivity: " + str(sens) +
                                 ", Specificity: " + str(spec))

        def predict_action():
            global new_ruleset
            global red_ruleset

            f0_text = e_f0.get()
            f1_text = e_f1.get()
            f2_text = e_f2.get()
            f3_text = e_f3.get()
            f4_text = e_f4.get()
            f5_text = e_f5.get()
            f6_text = e_f6.get()
            f7_text = e_f7.get()
            f8_text = e_f8.get()
            f9_text = e_f9.get()
            f10_text = e_f10.get()
            f11_text = e_f11.get()
            f12_text = e_f12.get()
            f13_text = e_f13.get()
            f14_text = e_f14.get()
            f15_text = e_f15.get()
            f16_text = e_f16.get()
            f17_text = e_f17.get()
            f18_text = e_f18.get()
            f19_text = e_f19.get()
            f20_text = e_f20.get()
            f21_text = e_f21.get()

            if ((f0_text == "") | (f1_text == "") | (f2_text == "") |
                (f3_text == "") | (f4_text == "") | (f5_text == "") |
                (f6_text == "") | (f7_text == "") | (f8_text == "") |
                (f9_text == "") | (f10_text == "") | (f11_text == "") |
                (f12_text == "") | (f13_text == "") | (f14_text == "") |
                (f15_text == "") | (f16_text == "") | (f17_text == "") |
                (f18_text == "") | (f19_text == "") | (f20_text == "") |
                (f21_text == "")):
                predict_label.config(text="not all features set")
            else:
                vec = [
                    float(f0_text),
                    float(f1_text),
                    float(f2_text),
                    float(f3_text),
                    float(f4_text),
                    float(f5_text),
                    float(f6_text),
                    float(f7_text),
                    float(f8_text),
                    float(f9_text),
                    float(f10_text),
                    float(f11_text),
                    float(f12_text),
                    float(f13_text),
                    float(f14_text),
                    float(f15_text),
                    float(f16_text),
                    float(f17_text),
                    float(f18_text),
                    float(f19_text),
                    float(f20_text),
                    float(f21_text)
                ]

                df = pd.DataFrame([vec], columns=feature_names)
                pred = self.random_forest.apply_ruleset_get_vector_new(
                    ruleset=new_ruleset, xtest=df)

                if pred[0] == 0:
                    string = "HEALTHY!"
                else:
                    string = "ALZHEIMERS DISEASE"

                predict_label.config(text="Prediction:  " + string + "!")

        def message_features():
            tkMessageBox.showinfo("Favourite Features", feature_info)

        def message_percentage():
            tkMessageBox.showinfo("Percentage", perc_info)

        def print_rules_():
            win = Toplevel(fenster)
            win.title("All Rules in Reduced Rule Set")
            scroll = Scrollbar(win)
            # scroll.pack(side = RIGHT, fill = Y)
            scroll.grid(row=0, column=1, sticky=N + S)

            txt = Text(win,
                       wrap=WORD,
                       yscrollcommand=scroll.set,
                       xscrollcommand=scroll.set)
            txt.grid(row=0, column=0, sticky=N + S + E + W)
            # txt.insert(INSERT, build_string_ruleset(ruleset=self.new_ruleset, featurenames=feature_names))
            txt.insert(INSERT, Gui.print_all_rules(new_ruleset, feature_names))
            # txt.insert(INSERT, "TEST")

            scroll.config(command=txt.yview)

        def bar_chart_orig_rules():
            global new_ruleset
            global red_ruleset

            wind = Toplevel(fenster)
            wind.title(
                "Number of rules containing respective features in original rule set"
            )

            f = Figure(figsize=(5, 4), dpi=100)
            ax = f.add_subplot(111)

            data = helpers.get_number_feat_in_rules(ruleset=red_ruleset,
                                                    features=range(0, 22))

            ind = np.arange(22)
            width = .5

            rects1 = ax.bar(ind, data, width)

            canvas = FigureCanvasTkAgg(f, master=wind)
            canvas.draw()
            canvas.get_tk_widget().pack(side=TOP, fill=BOTH, expand=1)

        def bar_chart_red_rules():
            global new_ruleset
            global red_ruleset

            wind = Toplevel(fenster)
            wind.title(
                "Number of rules containing respective features in reduced rule set"
            )

            f = Figure(figsize=(5, 4), dpi=100)
            ax = f.add_subplot(111)

            data = helpers.get_number_feat_in_rules(ruleset=new_ruleset,
                                                    features=range(0, 22))

            ind = np.arange(22)  # the x locations for the groups
            width = .5

            rects1 = ax.bar(ind, data, width)

            canvas = FigureCanvasTkAgg(f, master=wind)
            canvas.draw()
            canvas.get_tk_widget().pack(side=TOP, fill=BOTH, expand=1)

        # creating main window
        fenster = Tk()
        fenster.title("Decision Support")

        # information labels
        dataset = Label(fenster, text=data_set_name)
        numrules = Label(fenster, text="Number of Rules: " + str(len(ruleset)))
        feat_label = Label(fenster, text="Favourite Features (optional) ")
        perc_label = Label(fenster, text="Percentage")
        label_f0 = Label(fenster, text=feature_names[0])
        label_f1 = Label(fenster, text=feature_names[1])
        label_f2 = Label(fenster, text=feature_names[2])
        label_f3 = Label(fenster, text=feature_names[3])
        label_f4 = Label(fenster, text=feature_names[4])
        label_f5 = Label(fenster, text=feature_names[5])
        label_f6 = Label(fenster, text=feature_names[6])
        label_f7 = Label(fenster, text=feature_names[7])
        label_f8 = Label(fenster, text=feature_names[8])
        label_f9 = Label(fenster, text=feature_names[9])
        label_f10 = Label(fenster, text=feature_names[10])
        label_f11 = Label(fenster, text=feature_names[11])
        label_f12 = Label(fenster, text=feature_names[12])
        label_f13 = Label(fenster, text=feature_names[13])
        label_f14 = Label(fenster, text=feature_names[14])
        label_f15 = Label(fenster, text=feature_names[15])
        label_f16 = Label(fenster, text=feature_names[16])
        label_f17 = Label(fenster, text=feature_names[17])
        label_f18 = Label(fenster, text=feature_names[18])
        label_f19 = Label(fenster, text=feature_names[19])
        label_f20 = Label(fenster, text=feature_names[20])
        label_f21 = Label(fenster, text=feature_names[21])

        red1_label = Label(fenster)
        reduce_label = Label(fenster)
        predict_label = Label(fenster)
        acc_label = Label(fenster)

        # Here the user can enter something
        eingabefeld = Entry(fenster, bd=5, width=40)
        entrytext = Entry(fenster, bd=5, width=40)
        e_f0 = Entry(fenster, bd=5, width=8)
        e_f1 = Entry(fenster, bd=5, width=8)
        e_f2 = Entry(fenster, bd=5, width=8)
        e_f3 = Entry(fenster, bd=5, width=8)
        e_f4 = Entry(fenster, bd=5, width=8)
        e_f5 = Entry(fenster, bd=5, width=8)
        e_f6 = Entry(fenster, bd=5, width=8)
        e_f7 = Entry(fenster, bd=5, width=8)
        e_f8 = Entry(fenster, bd=5, width=8)
        e_f9 = Entry(fenster, bd=5, width=8)
        e_f10 = Entry(fenster, bd=5, width=8)
        e_f11 = Entry(fenster, bd=5, width=8)
        e_f12 = Entry(fenster, bd=5, width=8)
        e_f13 = Entry(fenster, bd=5, width=8)
        e_f14 = Entry(fenster, bd=5, width=8)
        e_f15 = Entry(fenster, bd=5, width=8)
        e_f16 = Entry(fenster, bd=5, width=8)
        e_f17 = Entry(fenster, bd=5, width=8)
        e_f18 = Entry(fenster, bd=5, width=8)
        e_f19 = Entry(fenster, bd=5, width=8)
        e_f20 = Entry(fenster, bd=5, width=8)
        e_f21 = Entry(fenster, bd=5, width=8)

        reduce_rule_set_button = Button(fenster,
                                        text="Reduce Rule Set",
                                        command=reduce_action)
        predict_button = Button(fenster,
                                text="Predict",
                                command=predict_action)
        red1_button = Button(fenster,
                             text="First Reduction",
                             command=first_reduction)

        bar_chart_orig_button = Button(
            fenster,
            text="Show Features in Original Rule Set",
            command=bar_chart_orig_rules)
        bar_chart_red_button = Button(fenster,
                                      text="Show Features in Reduced Rule Set",
                                      command=bar_chart_red_rules)

        info_feat_button = Button(fenster,
                                  text="more info",
                                  command=message_features)
        info_perc_button = Button(fenster,
                                  text="more info",
                                  command=message_percentage)
        info_rules_button = Button(fenster,
                                   text="Print Rules",
                                   command=print_rules_)

        dataset.grid(row=0, column=0, columnspan=5)
        numrules.grid(row=0, column=6, columnspan=5)

        feat_label.grid(row=4, column=2, columnspan=3)
        perc_label.grid(row=5, column=2, columnspan=3)
        eingabefeld.grid(row=4, column=4, columnspan=5)
        reduce_rule_set_button.grid(row=6, column=1, columnspan=9)
        entrytext.grid(row=5, column=4, columnspan=5)
        predict_button.grid(row=12, column=1, columnspan=9)
        info_rules_button.grid(row=15, column=1, columnspan=9)
        # exit_button.grid(row = 4, column = 1)
        reduce_label.grid(row=7, column=0, columnspan=3)
        predict_label.grid(row=13, column=1, columnspan=9)
        acc_label.grid(row=7, column=3, columnspan=8)

        red1_button.grid(row=2, column=1, columnspan=9)
        red1_label.grid(row=3, column=1, columnspan=9)

        bar_chart_orig_button.grid(row=17, column=0, columnspan=5)
        bar_chart_red_button.grid(row=17, column=6, columnspan=5)

        info_feat_button.grid(row=4, column=9)
        info_perc_button.grid(row=5, column=9)

        label_f0.grid(row=8, column=0)
        label_f1.grid(row=8, column=1)
        label_f2.grid(row=8, column=2)
        label_f3.grid(row=8, column=3)
        label_f4.grid(row=8, column=4)
        label_f5.grid(row=8, column=5)
        label_f6.grid(row=8, column=6)
        label_f7.grid(row=8, column=7)
        label_f8.grid(row=8, column=8)
        label_f9.grid(row=8, column=9)
        label_f10.grid(row=8, column=10)
        label_f11.grid(row=10, column=0)
        label_f12.grid(row=10, column=1)
        label_f13.grid(row=10, column=2)
        label_f14.grid(row=10, column=3)
        label_f15.grid(row=10, column=4)
        label_f16.grid(row=10, column=5)
        label_f17.grid(row=10, column=6)
        label_f18.grid(row=10, column=7)
        label_f19.grid(row=10, column=8)
        label_f20.grid(row=10, column=9)
        label_f21.grid(row=10, column=10)

        e_f0.grid(row=9, column=0)
        e_f1.grid(row=9, column=1)
        e_f2.grid(row=9, column=2)
        e_f3.grid(row=9, column=3)
        e_f4.grid(row=9, column=4)
        e_f5.grid(row=9, column=5)
        e_f6.grid(row=9, column=6)
        e_f7.grid(row=9, column=7)
        e_f8.grid(row=9, column=8)
        e_f9.grid(row=9, column=9)
        e_f10.grid(row=9, column=10)
        e_f11.grid(row=11, column=0)
        e_f12.grid(row=11, column=1)
        e_f13.grid(row=11, column=2)
        e_f14.grid(row=11, column=3)
        e_f15.grid(row=11, column=4)
        e_f16.grid(row=11, column=5)
        e_f17.grid(row=11, column=6)
        e_f18.grid(row=11, column=7)
        e_f19.grid(row=11, column=8)
        e_f20.grid(row=11, column=9)
        e_f21.grid(row=11, column=10)

        fenster.mainloop()
Example #20
def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """

    MAX_LEVEL = 32  # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    debug_mode = get_json_bool(request, 'debug')
    use_reducer = not ("noreduce" in request.form)

    # Tokenize the text
    tokens = list(tokenize(txt))

    # Parse the text
    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None  # e.info
            forest = None

    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(
        forest)
    score = 0

    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode="w", encoding="utf-8") as f:
            if forest is not None:
                print("Reynir parse forest for sentence '{0}'".format(txt),
                      file=f)
                print("{0} combinations\n".format(combinations), file=f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file=f)
                else:
                    print("Too many combinations to dump", file=f)
            else:
                print("No parse available for sentence '{0}'".format(txt),
                      file=f)

    if forest is not None and use_reducer:
        # Reduce the parse forest
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            # Dump the reduced tree along with node scores
            with open("reduce.txt", mode="w", encoding="utf-8") as f:
                print("Reynir parse tree for sentence '{0}' after reduction".
                      format(txt),
                      file=f)
                ParseForestPrinter.print_forest(forest, file=f)

    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [[] for _ in range(nrows)]
    # Info about previous row spans
    rs = [[] for _ in range(nrows)]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0, ) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [toint(s) for s in parse_path.split("_")]
        path = [tuple(p[0:i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to the chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else []  # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key=lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            #assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = {"terminal"}
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = {"nonterminal"}
                # Get the 'pure' name of the nonterminal in question
                #assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                cls |= {"vertical"}
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))
    # Calculate the unique path choices available for this parse grid
    choices -= {NULL_TUPLE}  # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= {c[0:i] for i in range(1, len(c))}
    # Create a nice string representation of the unique path choices
    uc_list = ["_".join(str(c) for c in choice) for choice in unique_choices]
    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0, ) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    return render_template("parsegrid.html",
                           txt=txt,
                           err=err,
                           tbl=tbl,
                           combinations=combinations,
                           score=score,
                           debug_mode=debug_mode,
                           choice_list=uc_list,
                           parse_path=parse_path)
Example #21
    def __init__(self, rule_list, random_forest):
        self.random_forest = random_forest
        self.reducer = Reducer(rule_list, self.random_forest)
Example #22
def parse(toklist, single, use_reducer, dump_forest = False, keep_trees = False):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages

        version = bp.version
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    score = 0 # Reducer score of the best parse tree

                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                            if single and dump_forest:
                                # Dump the parse tree to parse.txt
                                with open("parse.txt", mode = "w", encoding= "utf-8") as f:
                                    print("Reynir parse tree for sentence '{0}'".format(" ".join(sent)), file = f)
                                    print("{0} combinations\n".format(num), file = f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file = f)
                                    else:
                                        print("Too many combinations to dump", file = f)

                        if use_reducer and num > 1:
                            # Reduce the resulting forest
                            forest, score = rdc.go_with_score(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                            if Settings.DEBUG:
                                print(ParseForestDumper.dump_forest(forest))

                            num = 1

                    except ParseError as e:
                        forest = None
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                            .format(slen, num, score,
                                "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing down the line:
                            # reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it to text format
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)

                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        version = version,
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # noinspection PyRedundantParentheses
    return (result, trees)
Example #23
def analyze():
    """ Find word categories in the submitted text """

    txt = request.form.get("txt", "").strip()

    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose = False) as bp: # Don't emit diagnostic messages

        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0 # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)

                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                            assert Fast_Parser.num_combinations(forest) == 1

                        # Mark the token list with the identified word categories
                        mark_categories(forest, toklist, sent_begin + 1)

                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}".format(slen, num,
                        "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses = num, err_index = err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens = toklist,
        tok_num = len(toklist),
        num_sent = num_sent,
        num_parsed_sent = num_parsed_sent,
        avg_ambig_factor = (total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

    # Return the tokens as a JSON structure to the client
    return jsonify(result = result)