Example #1
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """
    labels = ['Title', 'manufacturer', 'price', 'in_stock']

    user_input = ui.get_inputs(labels, "Please provide information")
    # Re-prompt until price and in_stock are numeric.
    while not common.is_number(user_input[2]) or not common.is_number(user_input[3]):
        ui.print_error_message('Error: Price and Stock value must be numbers')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('store/games.csv', table)
    return table
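
All of these snippets delegate to an is_number helper (usually common.is_number) that the aggregated sources never show. A minimal sketch of what such a helper typically looks like, assuming the permissive float-based check the examples appear to rely on (an assumption, not code taken from any of them):

def is_number(value):
    # Hypothetical helper: True if value parses as a float, else False.
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True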
Example #2
def getYearCats(catList, article):
    '''
    Analyses the sv.Wikipedia categories in an article to isolate
    information related to birth/death
    :param catList: list of categories
    :param article: article being worked on
    :returns: (birth_year, death_year)
    '''
    birth = None
    death = None

    if not catList:
        print 'no category for "%s" or page did not exist' % article
    else:
        for c in catList:
            if c.lower().startswith(u'kategori:avlidna'):
                if common.is_number(c.strip()[-4:]):
                    death = int(c.strip()[-4:])
                else:
                    print u'odd year for %s: %s' % (article, c)
            elif c.lower().startswith(u'kategori:födda'):
                if common.is_number(c.strip()[-4:]):
                    birth = int(c.strip()[-4:])
                else:
                    print u'odd year for %s: %s' % (article, c)
    return (birth, death)
Example #3
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """

    labels = ['month', 'day', 'year', 'type', 'amount']

    user_input = ui.get_inputs(labels, "Please provide information")
    # Re-prompt until the year and amount fields are numeric.
    while not common.is_number(user_input[2]) or not common.is_number(user_input[4]):
        ui.print_error_message('Error: Year and amount must be numbers')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('accounting/items.csv', table)
    return table
Example #4
def value_checker(title):
    """Checks Values for Specials Titles"""

    keep_checking = True

    while keep_checking:
        value = ui.get_inputs([title], '')

        if title == "Price":
            if common.is_number(value[0]):
                return value

        elif title == "Day":
            if common.is_number(value[0]):
                number = int(value[0])
                if number > 0 and number < 32:
                    return value

        elif title == "Month":
            if common.is_number(value[0]):
                number = int(value[0])
                if number > 0 and number <= 12:
                    return value

        elif title == "Year":
            if common.is_number(value[0]):
                number = int(value[0])
                if number > 1990 and number <= 2100:
                    return value
        else:
            return value
Example #5
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """

    labels = ['name', 'manufacturer', 'purchase_date', 'durability']

    user_input = ui.get_inputs(labels, "Please provide information")
    # Re-prompt until purchase_date and durability are numeric.
    while not common.is_number(user_input[2]) or not common.is_number(user_input[3]):
        ui.print_error_message('Error: Purchase date and durability must be numbers')
        user_input = ui.get_inputs(labels, "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('inventory/inventory.csv', table)
    return table
Example #6
    def create_valid(cls, expression):
        """Verifies that the expression is correct, and if so builds an expression from it"""
        parts = expression.split(' ')

        if len(parts) != 3:
            return False

        if not common.is_number(parts[0]) or not common.is_number(parts[2]):
            return False

        if not cls.__is_valid_operator(parts[1]):
            return False

        return cls(expression)
Example #7
    def _check_origin(self):
        for i in range(3):
            d = self.origin[i]
            if not common.is_number(d):
                self.error('origin[%s] must be number but got %s' % (i, type(d)))
                return 1
        return 0
Example #8
    def append(self, token):
        if self.contents:
            prev = self.contents[-1]

            # the minus sign implies a * -1 when used by itself
            if isinstance(prev, tokens.Minus):
                # TODO: fix this the rest of the way
                if len(self.contents) == 1:
                    self.contents.pop()
                    self.contents += [tokens.Value(-1), tokens.Mult()]

            # absorb: tokens can absorb the next token from the expression if it matches a list of types
            elif isinstance(token, prev.absorbs):
                if isinstance(token, Base):
                    token = token.flatten()

                prev.absorb(token)
                return

            # implied multiplication
            elif prev.priority == token.priority == tokens.Pri.NONE:

                # negative numbers actually have implied addition
                if isinstance(token, tokens.Value)\
                    and is_number(token.value) and int(token.value) < 0:
                    self.contents.append(tokens.Plus())
                else:
                    self.contents.append(tokens.Mult())

        self.raw.append(token)
        self.contents.append(token)
Example #9
def ui_remove(apartments, operations, *args):
    '''
    Handles the remove command, calling special functions for each type of argument.
    Input - the list of apartments, the list of all operations and the arguments
    Output - error messages if needed
    '''
    if len(args) == 3:
        startApartment = args[0]
        endApartment = args[2]
        if validate_remove_range(startApartment, endApartment):
            try:
                remove_range_apartment(apartments, operations,
                                       int(startApartment), int(endApartment))
            except Exception:
                print("There are no apartments within the mentioned criteria.")
    else:
        if len(args) == 1:
            if is_number(args[0]):
                apartmentNumber = int(args[0])
                if is_a_valid_apartment_number(apartmentNumber):
                    try:
                        remove_apartment(apartments, operations,
                                         apartmentNumber)
                    except Exception:
                        print(
                            "There is no apartment with the mentioned number.")
            else:
                transactionType = args[0]
                if is_a_valid_transaction(transactionType):
                    try:
                        remove_type(apartments, operations, args[0])
                    except Exception:
                        print("There is no transaction of that type.")
        else:
            print("Incorrect command.")
Example #10
    def save_data(self, data_source: dict):
        """
        Save new data to the DB, if it doesn't already exist (i.e. No duplicate data)
        :param data_source:
        :return: none
        """
        try:
            print('Database is updating...')
            new_list = []
            for k, v in data_source.items():
                new_row = [k]
                for nest_key, nest_value in v.items():
                    # Non-numeric values (e.g. the 'M' missing-data marker) are stored as ''.
                    if is_number(nest_value):
                        new_row.append(nest_value)
                    else:
                        new_row.append('')
                new_list.append(tuple(new_row))

            with DBOperations(self.db_name) as cursor:
                sql_save_data = """INSERT OR IGNORE INTO samples
                    (sample_date, max_temp, min_temp, avg_temp) VALUES (?, ?, ?, ?);"""
                for list_item in new_list:
                    cursor.execute(sql_save_data, list_item)
            print('Database updated.')
        except Exception as e:
            self.logger.error(e)
Example #11
    def run(self):
        self.network()
        self.init_reader()
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        init_model_path = config.get("runner.model_save_path")
        init_model_path = os.path.join(config["config_abs_dir"],
                                       init_model_path)
        logger.info("init_model_path: {}".format(init_model_path))
        for file in os.listdir(init_model_path):
            file_path = os.path.join(init_model_path, file)
            # hard code for epoch model folder
            if os.path.isdir(file_path) and is_number(file):
                self.epoch_model_path_list.append(file_path)
        if len(self.epoch_model_path_list) == 0:
            self.epoch_model_path_list.append(init_model_path)

        self.epoch_model_path_list.sort()
        logger.info("self.epoch_model_path_list: {}".format(
            self.epoch_model_path_list))
        for idx, model_path in enumerate(self.epoch_model_path_list):
            logger.info("Begin Infer Model {}".format(
                self.epoch_model_path_list[idx]))
            model_name = model_path.split("/")[-1]
            infer_res = self.run_infer(model_path, model_name)
            self.infer_result_dict["result"][model_name] = infer_res

        self.record_result()
        logger.info("Run Success, Exit.")
Example #13
def index_document_pipe( pipe: Pipeline, cfg: CollectionConfig, doc: Doc ):
    """Push a document into the index"""
    # doc_id = doc[ col.id_fld ]
    doc_id = x_id(doc, cfg.id_fld)

    pipe.hset( f'{cfg.name}/docs', doc_id, json.dumps(doc) )

    for fld in cfg.text_flds:
        if fld in doc:
            text = doc[fld]
            index_text( pipe, cfg, doc_id, text)

    for fld in cfg.facet_flds:
        if fld not in doc:
            continue

        for val in as_list( doc, fld ):
            assert is_scalar(val), f"Found non scalar value ({val}) in field '{fld}' of " \
                                   f"document with id {doc_id}"

            index_facet( pipe, cfg.name, doc_id, fld, val )

    for fld in cfg.number_flds:
        if fld not in doc:
            continue

        for val in as_list(doc, fld):
            if val is None:
                continue
            assert is_number(val), f"Found non numeric value ({val}) in field '{fld}' of " \
                                   f"document with id {doc_id}"

            index_numeric(pipe, cfg.name, doc_id, fld, val)
Example #14
    def run(self):
        self.network()
        self.init_reader()
        use_cuda = int(config.get("runner.use_gpu"))
        place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
        self.exe = paddle.static.Executor(place)

        init_model_path = config.get("runner.model_save_path")
        for file in os.listdir(init_model_path):
            file_path = os.path.join(init_model_path, file)
            # hard code for epoch model folder
            if os.path.isdir(file_path) and is_number(file):
                self.epoch_model_path_list.append(file_path)
                self.epoch_model_name_list.append(file)

        if len(self.epoch_model_path_list) == 0:
            self.epoch_model_path_list.append(init_model_path)
            self.epoch_model_name_list.append(init_model_path)

        self.epoch_model_path_list.sort()
        self.epoch_model_name_list.sort()

        for idx, model_path in enumerate(self.epoch_model_path_list):
            logger.info("Begin Infer Model {}".format(
                self.epoch_model_name_list[idx]))
            self.run_infer(model_path, self.epoch_model_name_list[idx])
        logger.info("Run Success, Exit.")
Example #15
def check_substitutions(subs):
    '''Subs: UFL terminals/variable -> sympy expressions of right type'''
    if not all(is_terminal(k) or isinstance(k, Variable) for k in subs.keys()):
        return False

    # If the form is defined in terms of vars as well as terminals we inject
    # unwrapped variables
    subs.update({
        k.ufl_operands[0]: v
        for k, v in subs.items() if isinstance(k, Variable)
    })

    check_scalar = lambda k, v: k.ufl_shape == () and (is_scalar(v) or
                                                       is_number(v))

    check_vector = lambda k, v: (
        (len(k.ufl_shape) == 1 and is_vector(v)) and (k.ufl_shape[0] in
                                                      (v.rows, v.cols)))

    check_matrix = lambda k, v: len(k.ufl_shape) == 2 and k.ufl_shape == (
        v.rows, v.cols)

    check = lambda p: check_scalar(*p) or check_vector(*p) or check_matrix(*p)

    return all(map(check, subs.items()))
Example #17
    def get_node(self, node):
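        # A numeric argument is presumably an index into self.nodes; anything
        # else is matched against node names, falling through to an implicit
        # None when nothing matches.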
        if common.is_number(node):
            return self.nodes[int(node)]

        for n in self.nodes:
            if n.name == node:
                return n
Example #18
 def ValidateData(self):
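     # Checks each form field in turn and returns the first error message
     # found; returns "valid" only when every check passes.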
     if not self.txt_emp_id.GetValue().strip():
         self.txt_emp_id.SetFocus()
         return "Employee ID is required"
     elif not self.txt_name.GetValue().strip():
         self.txt_name.SetFocus()
         return "Employee Name is required"
     elif not self.cbo_designation.GetValue().strip():
         self.cbo_designation.SetFocus()
         return "Designation is required"
     elif not self.cbo_posting.GetValue().strip():
         self.cbo_posting.SetFocus()
         return "Posting Place is required"
     elif not self.txt_incre_amt.GetValue().strip():
         self.txt_incre_amt.SetFocus()
         return "Increment is required"
     elif not self.dpc_incre_date.GetValue():
         self.dpc_incre_date.SetFocus()
         return "Increment Date is required"
     elif not self.txt_present_basic.GetValue().strip():
         self.txt_present_basic.SetFocus()
         return "Present Basic is required"
     elif not self.dpc_print_date.GetValue():
         self.dpc_print_date.SetFocus()
         return "Print date is required"
     elif self.cbo_designation.GetValue() not in self.designation.values():
         self.cbo_designation.SetValue("")
         self.cbo_designation.SetFocus()
         return "Designation is not correct"
     elif self.cbo_posting.GetValue() not in self.posting.values():
         self.cbo_posting.SetValue("")
         self.cbo_posting.SetFocus()
         return "Posting is not Correct"
     elif not is_number(self.txt_emp_id.GetValue().strip()):
         self.txt_emp_id.SetValue("")
         self.txt_emp_id.SetFocus()
         return "Employee ID is not Numeric"
     elif not is_number(self.txt_incre_amt.GetValue().strip()):
         self.txt_incre_amt.SetValue("")
         self.txt_incre_amt.SetFocus()
         return "Increment is not Numeric"
     elif not is_number(self.txt_present_basic.GetValue().strip()):
         self.txt_present_basic.SetValue("")
         self.txt_present_basic.SetFocus()
         return "Present Basic is not Numeric"
     else:
         return "valid"
Example #19
def is_a_valid_apartment_number(apartmentNumber):
    '''
    Check if a string represents a valid apartment number.
    Input - a string
    Output - True if the string is a valid apartment number, False otherwise
    '''
    if is_number(apartmentNumber) and int(apartmentNumber) > 0:
        return True
    print("Apartment number not valid.")
    return False
Example #20
    def _check_spacing(self):
        for i in range(3):
            d = self.spacing[i]
            if not common.is_number(d):
                self.error('spacing[%s] must be number but got %s' % (i, type(d)))
                return 1
            if d <= 0:
                self.error('spacing[%s] must be positive number but got %s' % (i, d))
                return 1
        return 0
Example #22
def is_a_valid_expense(amount):
    '''
    Check if a string represents a valid expense amount.
    Input - a string
    Output - True if the string is a natural number, False otherwise
    '''
    if is_number(amount) and int(amount) > 0:
        return True
    print("Expense value not valid.")
    return False
Example #23
 def build_marking(self, marking_string):
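     # Two input formats appear to be accepted: a digit string such as "0110"
     # (one digit per node) or a comma list such as "n1=2,n2=1" of
     # node=count pairs.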
     marking = [0] * len(self.nodes)
     if common.is_number(marking_string):
         for i in range(0, len(self.nodes)):
             marking[i] = int(marking_string[i])
     else:
         nds = marking_string.split(",")
         for i in range(0, len(nds)):
             val = nds[i].split("=")
             node = self.get_node(val[0].strip())
             marking[node.id] = int(val[1])
     return marking
Example #24
    def analyseDates(self, output=True):
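        # Bins image-creation dates relative to the competition date in
        # self.settingDate ('YYYY-MM'): the current competition, the period
        # since the last one, the rest of that year, then one bin per earlier
        # year. With output=True the counts are written to a CSV file.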
        wlm_date = (int(self.settingDate[:4]), int(self.settingDate[5:]))
        #Special bins
        current = u'current (%s)' % self.settingDate
        since_last = u'since_last (%d-%s – %s)' % (
            wlm_date[0] - 1, str(wlm_date[1]).zfill(2), current)
        rest_of_last_year = u'rest_of_last_year (%d – %d-%s)' % (
            wlm_date[0] - 1, wlm_date[0] - 1, str(wlm_date[1]).zfill(2))
        results = {current: 0, since_last: 0, rest_of_last_year: 0}
        blanks = 0
        for k, v in self.indata.iteritems():
            date_raw = v['created']
            #skip any entries without valid monument_id or value
            if date_raw == '' or len(date_raw) < 4:
                blanks += 1
                continue
            #prepare dates
            month = 0
            if not common.is_number(date_raw[:4]):
                date = ('text', 'text')
            else:
                if len(date_raw) >= 7:
                    month = int(date_raw[5:7])
                date = (int(date_raw[:4]), month)

            #binning
            if date == wlm_date:
                #the current competition
                results[current] += 1
            elif (date[0] == wlm_date[0]
                  and date[1] < wlm_date[1]) or (date[0] == wlm_date[0] - 1
                                                 and date[1] > wlm_date[1]):
                #since last competition
                results[since_last] += 1
            elif date[0] == wlm_date[0] - 1:
                #the rest of that year
                results[rest_of_last_year] += 1
            else:
                if not str(date[0]) in results.keys():
                    results[str(date[0])] = 1
                else:
                    results[str(date[0])] += 1
        if output:
            #too simple to be outputSimple()
            f = codecs.open(u'%s_dates.csv' % self.output, 'w', 'utf-8')
            f.write('#no. dates: %d\n' % len(results))
            f.write('#no. blanks: %d\n' % blanks)
            f.write('#dates|no. images\n')
            for k, v in results.iteritems():
                f.write('%s|%d\n' % (k, v))
            f.close()
Example #25
 def match_name(self):
     pos = self._scanner
     parts = []
     while not self._scanner.eof() and self.is_ident(self.peek()):
         if len(parts) > 0 and parts[-1] == '_' and self.peek() == '_':
             self.fail('consecutive-underscores')
         parts.append(self.peek())
         self.next()
     name = ''.join(parts)
     if common.is_number(name):
         return token.Token(token.NUM, int(name), position=pos)
     else:
         type = KEYWORDS.get(name, token.ID)
         return token.Token(type, name, position=pos)
Example #26
def expr_body(expr, coordnames=DEFAULT_NAMES, **kwargs):
    '''Generate a string (or list of strings) with the C++ code for the expression'''
    if is_number(expr):
        return expr_body(sp.S(expr), **kwargs)

    if isinstance(expr, sp.Expr) and is_scalar(expr):
        # Defined in terms of some coordinates
        xyz = set(coordnames)
        xyz_used = xyz & expr.free_symbols
        assert xyz_used <= xyz

        # Recognize the constant
        if not expr.free_symbols:
            # Flag that we can be constant
            return str(expr), kwargs, True

        # Expression params which need default values
        params = (expr.free_symbols - xyz_used)
        # Substitute for x[0], x[1], ...
        expr = expr.subs(
            {x: sp.Symbol('x[%d]' % i)
             for i, x in enumerate(coordnames)},
            simultaneous=True)
        # Body
        expr = ccode(expr).replace('M_PI', 'pi')
        # Default to zero
        kwargs.update(dict((str(p), kwargs.get(str(p), 0)) for p in params))
        # Convert
        return expr, kwargs, False

    # Tensors that sympy can represent as lists:
    # (1, n) or (n, 1) becomes a flat list of n entries
    if is_vector(expr):
        expr = sum(expr.tolist(), [])
    elif is_matrix(expr):
        expr = expr.tolist()

    # Other lists
    # FIXME: Can this be implemented without returning kwargs, i.e. would the
    # scalar branch modify its arguments? For now I don't see how.
    # https://stackoverflow.com/questions/45883655/is-it-always-safe-to-modify-the-kwargs-dictionary
    kwargs_ = kwargs
    is_constant_expr = True
    ans = ()
    for e in expr:
        f, kwargs_, is_constant = expr_body(e, **kwargs_)
        is_constant_expr = is_constant_expr and is_constant
        ans = ans + (f, )
    return ans, kwargs_, is_constant_expr
Example #27
    def generate_box_plot(self, start_year: int, end_year: int) -> dict:
        """
        Generate a box plot from the data for a range of years.
        :param start_year: starting year for box plotting
        :param end_year: ending year for box plotting
        :return: dict mapping the 'start-end' year range to the saved box plot image path
        """
        try:
            print('Generate a BOX PLOT between years[{0}-{1}]...'.format(
                start_year, end_year))

            my_db = DBOperations('weather.sqlite')
            years_data_list = []
            for current_year in range(start_year, end_year + 1):
                years_data_list.extend(my_db.fetch_data(current_year))

            monthly_weather_data = {
            }  # format: [1:[Jan temps],2:[Feb temps],..,12:[Dec temps]]
            for month in range(1, 13):
                if month not in monthly_weather_data:
                    monthly_weather_data[month] = []

            for item in years_data_list:
                if is_number(item[5]):
                    monthly_weather_data[int(item[1][5:7])].append(
                        float(item[5]))

            plot_title = 'Monthly Temperature Distribution for: ' + str(
                start_year) + ' to ' + str(end_year)
            plt.boxplot(monthly_weather_data.values(), sym="o", whis=1.5)
            plt.xlabel('Month')
            plt.ylabel('Temperature (Celsius)')
            plt.title(plot_title)
            file_name = str(start_year) + '_to_' + str(end_year) + '.png'

            # Create new directory
            output_dir = "images"
            mkdir_p(output_dir)
            file_path = '{0}/{1}'.format(output_dir, file_name)
            self.box_plot_path_saving_dict[str(start_year) + '-' +
                                           str(end_year)] = file_path

            plt.savefig(file_path)
            plt.show()
            return self.box_plot_path_saving_dict
        except Exception as e:
            self.logger.error(e)
Example #29
def main():
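    # Reads JSON review objects from stdin and prints a CSV histogram of
    # their 'rating' values, skipping reviews without a numeric rating.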
    counts = defaultdict(int)

    for line in sys.stdin:
        review = json.loads(line)
        if 'rating' not in review:
            continue
        rating = review['rating']

        if not is_number(rating):
            continue

        counts[rating] += 1

    print '"rating","count"'
    for rating, count in counts.iteritems():
        print '"{0}","{1}"'.format(rating, count)
Example #30
    def generate_line_plot(self, specific_year: int,
                           specific_month: int) -> dict:
        """
        Generate a line plot by month data.
        :param specific_year: the chosen year for line plotting
        :param specific_month: the chosen month for line plotting
        :return: dict mapping 'year-month' to the saved line plot image path
        """
        try:
            print('Generate a Line PLOT for [{0}-{1}]...'.format(
                specific_year, specific_month))
            month_string_list = [
                'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
                'Oct', 'Nov', 'Dec'
            ]
            my_db = DBOperations('weather.sqlite')
            specific_timestamp = []  # 2020-12-01
            specific_month_data = []

            month_data = my_db.fetch_data(specific_year, specific_month)
            for item in month_data:
                if is_number(item[5]):
                    specific_timestamp.append(float(item[1][-2:]))
                    specific_month_data.append(float(item[5]))

            plt.plot(specific_timestamp, specific_month_data)
            plt.xlabel('Day')
            plt.ylabel('Temperature (Celsius)')
            plot_title = 'Daily Temperature Distribution for: ' + month_string_list[
                specific_month - 1] + ' ' + str(specific_year)
            plt.title(plot_title)
            file_name = str(specific_year) + '-' + str(specific_month) + '.png'

            # Create new directory
            output_dir = "images"
            mkdir_p(output_dir)
            file_path = '{0}/{1}'.format(output_dir, file_name)

            self.line_plot_path_saving_dict[str(specific_year) + '-' +
                                            str(specific_month)] = file_path
            plt.savefig(file_path)
            plt.show()

            return self.line_plot_path_saving_dict
        except Exception as e:
            self.logger.error(e)
Example #31
def ui_filter(apartments, operations, *args):
    '''
    Handles the filter command.
    Input - the list of apartments, the list of operations and the argument as a string
    Output - an error message if needed or calls the valid sub-func
    '''
    if len(args) == 1:
        oldApartments = deepcopy(apartments)
        operations.append(('filter', oldApartments))
        if is_number(args[0]):
            if is_a_valid_expense(args[0]):
                filter_value(apartments, int(args[0]))
        else:
            if is_a_valid_transaction(args[0]):
                filter_type(apartments, args[0])
    else:
        print("Incorrect command.")
Example #32
def add(table):
    """
    Asks user for input and adds it into the table.

    Args:
        table: table to add new record to

    Returns:
        Table with a new record
    """
    user_input = ui.get_inputs(['Title', 'manufacturer', 'price', 'in_stock'],
                               "Please provide information")
    # Re-prompt until the price is numeric.
    while not common.is_number(user_input[2]):
        ui.print_error_message('Error: Price must be a number')
        user_input = ui.get_inputs(['Title', 'manufacturer', 'price', 'in_stock'],
                                   "Please provide information")
    new_id = common.generate_random(table)
    new_record = [new_id] + user_input
    table += [new_record]
    data_manager.write_table_to_file('store/games.csv', table)
    return table
Example #33
    def number(self, dot=True, test=False, inc=True):
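        # Appears to scan an optionally negative integer or decimal literal
        # starting at self.pos: dot=True permits one fractional part,
        # test=True only reports whether a valid number is present, and
        # inc=True advances self.pos past the consumed characters.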
        num = ''
        char = ''  # ensure char is defined even if the scan loop never runs
        first = True
        pos = self.pos
        while self.more(pos):
            char = self.source[pos]
            if char == '-' and first: pass
            elif not char.isdigit():
                break

            first = False
            num += char
            pos += 1

        if char == '.' and dot:
            num += '.'
            pos += 1

            self.pos, tmp = pos, self.pos
            try:
                num += str(self.number(dot=False))
            except ParseError:
                pass

            pos, self.pos = self.pos, tmp

        if inc and not test: self.pos = pos

        if is_number(num):
            if test: return True
            try:
                n = int(num)
            except ValueError:
                n = float(num)

            return n
        else:
            if test: return False
            lines = self.source[:pos]
            line = lines.count('\n') + 1
            col = max(self.pos - lines.rfind('\n'), 0)
            raise ParseError('invalid number ending at {}:{}: {}'.format(
                line, col, num))
Example #35
def perform_semantic_inference(cluster_collection):
    """
    This function performs semantic inference on the given list of clusters.
    For each message in these clusters, semantics are inferred by analyzing each
    token and its context.

    At the moment only two semantics are inferred automatically: numeric and IPv4 address.

    TODO: Add more semantics, e.g. EOL identifier, length fields, ...
    """
    # Try to perform semantic inferences

    # Walk through every cluster and check messages for obvious results
    cluster = cluster_collection.get_all_cluster()
    for c in cluster:
        messages = c.get_messages()
        for message in messages:
            tokenlist = message.get_tokenlist()
            iterator = peekable(tokenlist)
            idx = 0
            while not iterator.isLast():
                # for tokenRepresentation in tokenlist:
                tokenRepresentation = iterator.next()
                # TODO: do we need to keep semantics which involve multiple cluster? e.g. sessionids?
                previous_semantics = tokenRepresentation.get_semantics()
                tokenRepresentation.set_semantics([])  # Clear existing semantics from previous run
                # for s in previous_semantics:
                #    if s.startswith("sessionid"):
                #        tokenRepresentation.add_semantic(s)
                #        break

                if "sessionid" in previous_semantics:
                    # Check if we have at least 2 messages and we are not of type Const
                    if len(messages) > 1 and c.get_format(idx) != Message.typeConst:
                        tokenRepresentation.add_semantic("sessionid")
                if "FD" in previous_semantics:
                    tokenRepresentation.add_semantic("FD")

                token = tokenRepresentation.get_token()
                # Check whether it is numeric

                try:
                    isNumber = tokenRepresentation.get_tokenType() == Message.typeText and common.is_number(token)
                except TypeError:
                    if Globals.getConfig().debug:
                        print "Error checking token {0} for number semantics".format(token)
                    isNumber = False
                if isNumber:
                    tokenRepresentation.add_semantic("numeric")
                    # c.add_semantics(idx,"numeric")
                    # print "Inferred semantic inference 'numeric' for token ", token

                # Check whether it is an IP address
                if isinstance(token, str) and common.is_ipv4(token):
                    tokenRepresentation.add_semantic("ipv4 address")
                    # Do not add to cluster unless it is valid for all c.add_semantics(idx,"ipv4 address")
                    # print "Inferred semantic inference 'ipv4 address' for token ", token

                # Check for carriage return identifiers
                # When 0d is followed by 0a we've got a CR-LF
                # Sensible? When 0d or 0a is the last token, we've got a single CR resp LF
                # In all other cases assume 0d/0a is just a hex value of the protocol
                if token == 0xD:
                    nextOne = iterator.peek()
                    if isinstance(nextOne, TokenRepresentation):
                        if nextOne.get_token() == 0xA:
                            inferred_formats = c.get_format_inference()
                            if (
                                inferred_formats[idx].getType() == Message.typeConst
                                and inferred_formats[idx + 1].getType() == Message.typeConst
                            ):
                                tokenRepresentation.add_semantic("CR")
                                # c.add_semantics(idx,"CR")
                                nextOne = iterator.next()
                                nextOne.set_semantics(["LF"])
                                # c.add_semantics(idx+1, "LF")
                                idx += 1

                idx += 1
        # Perform other tests, like "is this a length field?"
        # Explicitly iterate through all messages as stated in the paper;
        # we could also postpone this to the call of pushUpToCluster(), but...

        reference_message = messages[0]
        tokenlist = reference_message.get_tokenlist()
        idx = 0
        for tokenRepresentation in tokenlist:
            if tokenRepresentation.get_tokenType() == Message.typeBinary and idx + 1 < len(tokenlist):
                ref_value = tokenRepresentation.get_token()
                if (
                    not tokenlist[idx + 1].get_tokenType() == Message.typeText
                ):  # We require that the next token is the text token in question
                    idx += 1
                    continue
                ref_next_length = tokenlist[idx + 1].get_length()
                if not ref_value == ref_next_length:  # This is no length field
                    idx += 1
                    continue
                ref_message_length = reference_message.get_length()
                is_length = True
                for message in messages:
                    cmp_value = message.get_tokenlist()[idx].get_token()
                    cmp_next_length = message.get_tokenlist()[idx + 1].get_length()
                    cmp_message_length = message.get_length()
                    try:
                        diff_val = abs(cmp_value - ref_value)
                    except TypeError:  # Could happen if a short text token is mistaken as a binary value
                        break
                    diff_next_length = abs(cmp_next_length - ref_next_length)
                    # The next line also takes total msg length differences into account. This might not be true for
                    # all protocols
                    diff_msg_length = abs(cmp_message_length - ref_message_length)

                    if Globals.getConfig().requireTotalLengthChangeForLengthField:
                        if not (diff_val == diff_next_length == diff_msg_length):
                            is_length = False
                        break
                    else:
                        if not (diff_val == diff_next_length):
                            is_length = False
                            break

                if is_length:  # set "lengthfield" semantic for every message in the cluster at the given position
                    for message in messages:  # TODO: What if there's only one message in the cluster? Sensible?
                        message.get_tokenlist()[idx].add_semantic("lengthfield")
                        c.add_semantic_for_token(idx, "lengthfield")
            idx += 1

        # Try to identify sessionid fields

        reference_message = messages[0]
        nextInFlow = reference_message.getNextInFlow()
        if nextInFlow is not None and not (
            len(messages) == 1 and Globals.getConfig().sessionIDOnlyWithClustersWithMoreThanOneMessage
        ):
            tokenlist = reference_message.get_tokenlist()
            next_tokenlist = nextInFlow.get_tokenlist()
            ref_idx = 0
            for tokenRepresentation in tokenlist:
                tokType = tokenRepresentation.get_tokenType()
                # If its not a binary, it cannot be a cookie
                if tokType != Message.typeBinary:
                    ref_idx += 1
                    continue
                fmt = c.get_format(ref_idx)
                # If its a binary but const, it cannot be a cookie
                if fmt[1] == Message.typeConst:
                    ref_idx += 1
                    continue
                # Set reference value
                ref_val = tokenRepresentation.get_token()
                # Walk next flow for reference value
                next_idx = 0
                for next_tokenRepresentation in next_tokenlist:
                    # Retrieve next token type
                    nextTokType = next_tokenRepresentation.get_tokenType()
                    # If it is not a binary we don't see it as a cookie
                    if Globals.getConfig().sessionIDOnlyWithBinary:
                        if nextTokType != Message.typeBinary:
                            next_idx += 1
                            continue
                    next_cluster = nextInFlow.getCluster()
                    # Get format of comparating message
                    comp_fmt = next_cluster.get_format(next_idx)
                    # If it is const, it cannot be a sessonid
                    if comp_fmt[1] == Message.typeConst:
                        next_idx += 1
                        continue
                    # Load comparator value
                    comp_val = next_tokenRepresentation.get_token()
                    if (
                        ref_val == comp_val
                    ):  # We've got a potential hit, now compare all messages for the same idx pairs
                        isCookie = True
                        for cmp_ref_msg in messages:
                            if not isCookie:
                                break
                            if cmp_ref_msg == messages[0]:  # Skip the first message (we've already checked that one)
                                continue
                            cmp_ref_tok_list = cmp_ref_msg.get_tokenlist()

                            cmp_ref_val = cmp_ref_tok_list[ref_idx].get_token()
                            cmp_cmp_msg = cmp_ref_msg.getNextInFlow()
                            if cmp_cmp_msg is None:
                                isCookie = False
                            else:
                                cmp_cmp_tok_list = cmp_cmp_msg.get_tokenlist()
                                if next_idx >= len(cmp_cmp_tok_list):
                                    # Obviously "next" points to messages in different clusters
                                    # so the len might differ from the reference next cluster
                                    # used to find our reference cookie value
                                    # Therefore this cannot be a cookie
                                    isCookie = False
                                    continue
                                # Make sure the comparing token is also not constant
                                cmp_cmp_fmt = cmp_cmp_msg.getCluster().get_format(next_idx)
                                # If it is const, it cannot be a sessonid
                                if cmp_cmp_fmt == Message.typeConst:
                                    isCookie = False
                                    continue

                                # Finally compare the values
                                cmp_cmp_val = cmp_cmp_tok_list[next_idx].get_token()
                                if (cmp_ref_val != cmp_cmp_val) or (
                                    (cmp_ref_val == cmp_cmp_val) and (cmp_ref_val == ref_val)
                                ):
                                    isCookie = False
                        if isCookie:
                            # Set cookie semantic in this message and the other
                            # sessionid = uuid.uuid1()
                            for message in messages:  # Set for every message and the cluster itself
                                # message.get_tokenlist()[ref_idx].add_semantic("sessionid_{0}".format(sessionid))
                                message.get_tokenlist()[ref_idx].add_semantic("sessionid")
                                nextMsg = message.getNextInFlow()
                                # nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid_{0}".format(sessionid))
                                nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid")
                            c.add_semantic_for_token(ref_idx, "sessionid")
                            # c.add_semantic_for_token(ref_idx,"sessionid_{0}".format(sessionid))
                    next_idx += 1
                ref_idx += 1

        # Try to find random fields (16 bit)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable and rep == Message.typeBinary:
                try:
                    variance = c.getVariableStatistics()[idx].getVariance()
                except Exception:
                    variance = 0  # avoid an unbound variance when statistics are missing

                if variance > 1000 and len(semantics) == 0:
                    # We've got a very high variance and no assigned semantics --> candidate for random
                    # Have a look at the last but one token
                    if idx - 1 >= 0:
                        rep, form, semantics = token_formats[idx - 1]
                        if form.getType() == Message.typeVariable and rep == Message.typeBinary:
                            stats = c.getVariableStatistics()[idx - 1]
                            if stats is not None:
                                variance2 = stats.getVariance()
                            else:
                                logging.error(
                                    "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})".format(
                                        idx, len(token_formats), len(c.getVariableStatistics())
                                    )
                                )
                                idx += 1
                                continue

                            if variance2 > 1000 and len(semantics) == 0:
                                # Consider the two as a CRC-16
                                for message in messages:  # Set for every message and the cluster itself
                                    message.get_tokenlist()[idx - 1].add_semantic("random")
                                    message.get_tokenlist()[idx].add_semantic("random")
                                c.add_semantic_for_token(idx - 1, "random")
                                c.add_semantic_for_token(idx, "random")
            idx += 1

        # Try to find sets (valued limited in variability with lower and upper bound)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable:
                stats = c.getVariableStatistics()[idx]
                if stats is not None:
                    distinct = stats.numberOfDistinctSamples()
                else:
                    logging.error(
                        "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})".format(
                            idx, len(token_formats), len(c.getVariableStatistics())
                        )
                    )
                    idx += 1
                    continue
                # How will we find out whether a number of variable values is a set or really variable?
                # We assume that there is an absolute maximum amount of distinct values which is independent
                # of the actual number of messages. However we also need to consider that when the number of
                # messages in a cluster definitely falls below the setAbsoluteMax value, we have to adapt
                # the local maximum in this cluster.
                # For the moment we take a multiplier for the number of messages (default 0.3 == 30%) and
                # assume it is a set when both setAbsoluteMax and the local threshold are underrun.
                # In addition we assume that we have no semantics for this token, as other semantics
                # conflict with the notion of a set.
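                # Hypothetical illustration (numbers assumed, not from the source):
                # with setAbsoluteMax = 10, setPercentageThreshold = 0.3 and a
                # cluster of 50 messages, a token with 8 distinct values counts
                # as a set (8 <= 10 and 8 <= 15), while 20 distinct values do not.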

                if (
                    distinct <= Globals.getConfig().setAbsoluteMax
                    and distinct <= (c.getNumberOfMessages() * Globals.getConfig().setPercentageThreshold)
                    and len(semantics) == 0
                ):
                    for message in messages:  # Set for every message and the cluster itself
                        message.get_tokenlist()[idx].add_semantic("set")
                    c.add_semantic_for_token(idx, "set")
            idx += 1
    # Push to cluster
    pushUpToCluster(cluster_collection)
Example #36
def findMatches(odok, wiki):
    '''
    tries to find matches between scraped items and existing odok items
    identified matches have the odok id added to the wiki object
    TODO: Expand to display several alternatives
    '''
    # remove any id's which have already been identified
    matched_ids = []
    for w in wiki:
        if w['id']:
            if w['id'] in matched_ids:
                print u'id %s was matched to more than one wiki object!' % w['id']
            else:
                matched_ids.append(w['id'])
    print u'%r out of %r already matched (out of a maximum of %r)' % (len(matched_ids), len(wiki), len(odok))

    # make lists of odok titles and artists
    odok_titles = {}
    odok_artist = {}
    odok_surname = {}
    for key, o in odok.iteritems():
        if key in matched_ids:
            continue
        if o['title']:
            if o['title'] in odok_titles.keys():
                odok_titles[o['title']].append(key)
            else:
                odok_titles[o['title']] = [key, ]
        if o['artist']:
            if o['artist'] in odok_artist.keys():
                odok_artist[o['artist']].append(key)
            else:
                odok_artist[o['artist']] = [key, ]
            surname = wash(o['artist'].split(' ')[-1])
            if surname in odok_surname.keys():
                odok_surname[surname].append(key)
            else:
                odok_surname[surname] = [key, ]

    # remove any id's which have already been identified
    for w in wiki:
        if w['id']:
            continue
        wIdN = None
        wIdA = None
        wIdS = None
        match = ([], '')
        if w['namn'] in odok_titles.keys():
            wIdN = odok_titles[w['namn']]
        if w[u'skulptör'] in odok_artist.keys():
            wIdA = odok_artist[w[u'skulptör']]
        if wash(w[u'skulptör'].split(' ')[-1]) in odok_surname.keys():
            wIdS = odok_surname[wash(w[u'skulptör'].split(' ')[-1])]
        if wIdN and wIdA:  # match on both title and artist
            if len(wIdN) == 1:
                if wIdN[0] in wIdA:
                    match = ([wIdN[0]], 'double match')
                else:
                    match = ([wIdN[0]], 'title match but artist missmatch')
            else:
                for nId in wIdN:
                    if nId in wIdA:
                        match = ([nId], 'Non-unique title with artist match')
                        break
        elif wIdN:  # match on title only
            match = (wIdN, 'titel match')
        elif wIdA:  # match on artist only
            match = (wIdA, 'artist match')
        elif wIdS:  # last ditch attempt matching surname.
            match = (wIdS, 'surname match')
            # always check this if no match?
            # replace do "nice search" with ss->s
        # explicitly ask for verification for each match
        if match[0]:
            keys = match[0]
            print u'%s: (%s)' % (match[1], ' | '.join(keys))
            print u'W: "%s", "%s", (%s), "%s"' % (w[u'namn'], w[u'skulptör'], w[u'årtal'], w['plats'])
            for r in range(0, len(keys)):
                key = keys[r]
                print u'%r: "%s", "%s", (%s), "%s"' % (r, odok[key]['title'], odok[key][u'artist'], odok[key][u'year'], odok[key][u'address'])
            while True:
                inChoice = raw_input('Accept? [#/N]:')
                if inChoice == 'N' or inChoice == 'n':
                    break
                elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)):
                    w['id'] = keys[int(inChoice)]
                    break
Example #37
def updatesToDatabase(odok, wiki, quick=False):
    '''
    given a wiki-entry which has been matched to an odok object
    this checks whether any of the wikiinfo should be added to the odok
    object and prepares an update statement.
    setting quick to true puts any updates requiring decision making into the postponed output
    '''
    wpApi = wikiApi.WikiApi.setUpApi(user=config['w_username'],
                                     password=config['w_password'],
                                     site=config['wp_site'])
    updated = {}
    postponed = {}
    linked_artists = {}
    mapping = {u'namn': 'title', u'skulptör': 'artist', u'årtal': 'year',
               u'material': 'material', u'plats': 'address', u'header': 'district',
               u'lat': 'lat', u'lon': 'lon', u'bild': 'image', u'typ': 'type'}
    # non-trivial mappings u'namn_link': 'wiki'
    for w in wiki:
        if not w['id']:
            continue
        o = odok[w['id']]
        changes = {}
        skipped = {}
        for k, v in mapping.iteritems():
            if k not in w.keys():  # for postponed file some fields might be missing
                continue
            no_Tags, dummy = common.extractLink(w[k], kill_tags=True)
            if not no_Tags:  # skip if w[k] is empty (or only a tag)
                continue
            if (not o[v]) and no_Tags:  # trivial case of new info
                changes[v] = no_Tags
            elif o[v] and (not o[v].lower() == no_Tags.lower()):
                if quick:
                    skipped[k] = w[k]
                else:
                    # need to decide which to use
                    print u'Diff for %s (%s): %s' % (w['id'], o['title'], v)
                    print u' ödok: "%s"' % o[v]
                    print u' wiki: "%s"' % w[k]
                    while True:
                        inChoice = raw_input(u'Use wiki [Y(es)/N(o)/S(kip)]:')
                        if inChoice.lower() == u'n' or inChoice.lower() == u'no':
                            break
                        elif inChoice.lower() == u'y' or inChoice.lower() == u'yes':
                            changes[v] = no_Tags
                            break
                        elif inChoice.lower() == u's' or inChoice.lower() == u'skip':
                            skipped[k] = w[k]
                            break

        # register any artist_links so that these can be compared to existing links
        if u'skulptör_link' in w.keys() and w[u'skulptör_link']:  # postponed might not have u'skulptör_link'
            for a in w[u'skulptör_link']:
                if a in linked_artists.keys():
                    linked_artists[a].append(w['id'])
                else:
                    linked_artists[a] = [w['id'], ]

        # article_links must be checked manually since link may be depictive rather than of the actual object.
        if (u'namn_link' in w.keys() and w['namn_link']) and not o[u'wiki']:  # postponed might not have u'namn_link'
            keys = w['namn_link']
            print u'Potential title link for "%s" ("%s" on wiki)' % (o['title'], w['namn'])
            for r in range(0, len(keys)):
                print u'%r: "%s"' % (r, keys[r])
            while True:
                inChoice = raw_input('Accept? [#/N]:')
                if inChoice == 'N' or inChoice == 'n':
                    break
                elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)):
                    # NEW START
                    wdInfo = wpApi.getPageInfo(keys[int(inChoice)], debug=True)[keys[int(inChoice)]]
                    if 'wikidata' in wdInfo.keys() and wdInfo['wikidata']:  # if exists and not empty
                        changes[u'wiki'] = wdInfo['wikidata']
                    break
        # add changes
        if changes:
            updated[w['id']] = changes.copy()
        if skipped:
            postponed[w['id']] = skipped.copy()
    # end of wiki_object loop

    # TODO: Build a new wikidata module.
    # TODO: Build an odok_write module in the same spirit, moving much of writeToDatabase into it.
    # if no header, try page
    # plats_link?
    return (updated, postponed, linked_artists)
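updatesToDatabase relies on common.extractLink(value, kill_tags=True) to reduce a wikitext value to plain text plus a link. That helper is not part of this listing; the sketch below is an assumption inferred from the call sites ([[target|label]] becomes its label, tags are dropped, and the first link target is returned):

import re

def extractLink(text, kill_tags=False):
    # Hypothetical sketch of common.extractLink as used above; the real
    # implementation may differ.
    link = None
    if kill_tags:
        # drop <ref>...</ref> style tag pairs as well as lone tags
        text = re.sub(r'<(\w+)[^>]*>.*?</\1>|<[^>]+/?>', '', text, flags=re.DOTALL)
    m = re.search(r'\[\[([^|\]]+)(?:\|([^\]]+))?\]\]', text)
    if m:
        link = m.group(1).strip()
        text = text.replace(m.group(0), m.group(2) or m.group(1))
    return text.strip(), link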
Example #38
def compareToDB(wikiObj, odokObj, wpApi, dbReadSQL, verbose=False):
    '''
    Compares a list object to the equivalent object in the database.
    This needs to deal with links and wikitext,
    and should check the clash parameter.

    should return (diff, log)
            diff: dict of changes (if any), otherwise None
            log: string listing issues encountered, e.g. incorrectly formatted wikitext
    TODO:
        proper log for coordinates
        only care about first X decimals in coordinate
        return needed/removed links
        fotnot-name
        should anything be done with:
            * odok:u'same_as'
            * odok:u'year_cmt'
    '''
    # wikiObj.keys() = [u'typ', u'artikel', u'titel', 'clash', u'inomhus', u'material', u'döljStadsdel', u'län', u'konstnär2',
    #                   u'konstnär3', u'konstnär4', u'konstnär5', u'konstnär6', u'konstnär7', u'konstnär8', u'konstnär9',
    #                   u'döljKommun', u'lat', u'plats', u'fotnot', u'fotnot2', u'fotnot3', u'id', u'kommun',
    #                   u'bild', u'stadsdel', u'commonscat', u'fri', u'konstnär', u'lon', u'beskrivning', u'årtal', u'id-länk',
    #                   u'fotnot-namn', u'fotnot2-namn', u'fotnot3-namn', u'aka', u'page', u'lista', u'header']
    # odokObj.keys() = [u'changed', u'official_url', u'ugc', u'image', u'county', u'year', u'owner', u'commons_cat', u'id',
    #                   u'wiki', u'list', u'descr', u'title', u'lon', u'source', u'same_as', u'type', u'muni', u'material', u'free',
    #                   u'district', u'address', u'lat', u'year_cmt', u'artist', u'inside', u'created', u'cmt', u'removed']

    log = ''
    if wikiObj['clash']:
        log += u'clash with another page. Don\'t know how to resolve this. Skipping: %s\n' % wikiObj['clash']
        return (None, log)

    ## Pre-processing
    # get some more things from ODOK
    odokObj[u'linked_artists'] = dbReadSQL.findArtist(wikiObj[u'id'])
    odokObj[u'artist_links'] = []
    for a in odokObj[u'linked_artists']:
        odokObj[u'artist_links'].append(a['wiki'])
    odokObj[u'aka'] = ''
    akas = dbReadSQL.findAkas(wikiObj[u'id'])
    if akas:
        odokObj[u'aka'] = []
        for a in akas:
            odokObj[u'aka'].append(a['aka'])
        odokObj[u'aka'] = ';'.join(odokObj[u'aka'])
    if odokObj[u'wiki']:
        odokObj[u'wiki'] = odokObj[u'wiki'].upper()

    # the following is inherited from the header
    if wikiObj[u'header'][u'tidigare']:
        wikiObj[u'tidigare'] = 1
    else:
        wikiObj[u'tidigare'] = 0

    # the following may be inherited from the header
    if wikiObj[u'döljKommun']:
        wikiObj[u'kommun'] = wikiObj[u'header'][u'kommun']
    if not wikiObj[u'län']:
        wikiObj[u'län'] = wikiObj[u'header'][u'län']
    if wikiObj[u'döljStadsdel'] and not wikiObj[u'stadsdel']:  # only overwrite non-existent
        wikiObj[u'stadsdel'] = wikiObj[u'header'][u'stadsdel']
    # the following are limited in their values but need mapping from wiki to odok before comparison
    if wikiObj[u'fri'].lower() == 'nej':
        wikiObj[u'fri'] = 'unfree'
    if wikiObj[u'inomhus']:
        if wikiObj[u'inomhus'].lower() == 'ja':
            wikiObj[u'inomhus'] = 1
        elif wikiObj[u'inomhus'].lower() == 'nej':
            wikiObj[u'inomhus'] = 0
        else:
            log += 'unexpected value for inside-parameter (defaulting to no): %s\n' % wikiObj[u'inomhus']
            wikiObj[u'inomhus'] = 0
    else:
        wikiObj[u'inomhus'] = 0
    if wikiObj[u'kommun']:  # need muni code
        wikiObj[u'kommun'] = dataDict.muni_name2code[wikiObj[u'kommun']]
    if wikiObj[u'län'].startswith(u'SE-'):
        wikiObj[u'län'] = wikiObj[u'län'][len(u'SE-'):]
    if wikiObj[u'lat'] == '':
        wikiObj[u'lat'] = None
    else:
        if len(wikiObj[u'lat']) > 16:
            wikiObj[u'lat'] = '%.13f' % float(wikiObj[u'lat'])
        wikiObj[u'lat'] = wikiObj[u'lat'].strip('0')  # due to how numbers are stored
    if wikiObj[u'lon'] == '':
        wikiObj[u'lon'] = None
    else:
        if len(wikiObj[u'lon']) > 16:
            wikiObj[u'lon'] = '%.13f' % float(wikiObj[u'lon'])
        wikiObj[u'lon'] = wikiObj[u'lon'].strip('0')  # due to how numbers are stored
    if wikiObj[u'årtal'] == '':
        wikiObj[u'årtal'] = None

    # Deal with artists (does not deal with order of artists being changed):
    artist_param = [u'konstnär', u'konstnär2', u'konstnär3',
                    u'konstnär4', u'konstnär5', u'konstnär6',
                    u'konstnär7', u'konstnär8', u'konstnär9']
    wikiObj[u'artists'] = ''
    artists_links = {}
    for a in artist_param:
        if wikiObj[a]:
            (w_text, w_links) = unwiki(wikiObj[a])
            wikiObj[u'artists'] = u'%s%s;' % (wikiObj[u'artists'], w_text)
            if w_links:
                artists_links[w_text] = w_links[0]
    if wikiObj[u'artists']:
        wikiObj[u'artists'] = wikiObj[u'artists'][:-1]  # trim trailing ;

    ## dealing with links:
    links = artists_links.values()
    if wikiObj[u'artikel']:
        if u'#' in wikiObj[u'artikel']:
            log += u'link to section: %s\n' % wikiObj[u'artikel']
        else:
            links.append(wikiObj[u'artikel'])
    if links:
        links = wpApi.getPageInfo(links)
        for k, v in links.iteritems():
            if u'disambiguation' in v.keys():
                log += u'link to disambigpage: %s\n' % k
                links[k] = ''
            elif u'wikidata' in v.keys():
                links[k] = v[u'wikidata']
            else:
                links[k] = ''
    else:
        links = {}
    # Stick wikidata back into parameters
    if wikiObj[u'artikel']:
        if u'#' not in wikiObj[u'artikel']:
            wikiObj[u'artikel'] = links.pop(wikiObj[u'artikel'])
        else:
            wikiObj[u'artikel'] = ''
    wikiObj[u'artist_links'] = links.values()

    ## Main-process
    diff = {}
    # easy to compare {wiki:odok}
    trivial_params = {u'typ': u'type',
                      u'material': u'material',
                      u'id-länk': u'official_url',
                      u'fri': u'free',
                      u'inomhus': u'inside',
                      u'artists': u'artist',
                      u'årtal': u'year',
                      u'commonscat': u'commons_cat',
                      u'beskrivning': u'descr',
                      u'bild': u'image',
                      u'titel': u'title',
                      u'aka': u'aka',
                      u'artikel': u'wiki',
                      u'list': u'list',
                      u'plats': u'address',
                      u'län': u'county',
                      u'kommun': u'muni',
                      u'stadsdel': u'district',
                      u'tidigare': u'removed',
                      u'lat': u'lat',
                      u'lon': u'lon',
                      u'fotnot': u'cmt'}

    for k, v in trivial_params.iteritems():
        (w_text, w_links) = unwiki(wikiObj[k])
        if not (w_text == odokObj[v]):
            diff[v] = {'new': w_text, 'old': odokObj[v]}
            if verbose:
                print u'%s:"%s"    <--->   %s:"%s"' % (k, w_text, v, odokObj[v])

    ## Needing separate treatment
    # comparing artist_links: u'artist_links':u'artist_links'
    artist_diff = {'+': [], '-': []}
    artist_links = list(set(wikiObj[u'artist_links'])-set(odokObj[u'artist_links']))
    if artist_links and len(''.join(artist_links)) > 0:
        artist_diff['+'] = artist_links[:]  # slice to clone the list
    artist_links = list(set(odokObj[u'artist_links'])-set(wikiObj[u'artist_links']))
    if artist_links and len(''.join(artist_links)) > 0:
        artist_diff['-'] = artist_links[:]  # slice to clone the list
    # handler can only deal with new artists
    if len(artist_diff['-']) == 0 and len(artist_diff['+']) > 0:
        artIds = dbReadSQL.getArtistByWiki(artist_diff['+'])  # list of id:{'first_name', 'last_name', 'wiki', 'birth_date', 'death_date', 'birth_year', 'death_year'}
        newArtistLinks = []
        for k, v in artIds.iteritems():
            artist_diff['+'].remove(v['wiki'])
            newArtistLinks.append(k)
        if len(newArtistLinks) > 0:
            diff[u'artist_links'] = {'new': newArtistLinks, 'old': []}
    # output remaining to log
    for k, v in artist_diff.iteritems():
        if len(v) > 0:
            log += u'difference in artist links, linkdiff%s: %s\n' % (k, ';'.join(v))

    ## akas
    if 'aka' not in diff.keys():
        pass
    elif sorted(diff['aka']['new'].split(';')) == sorted(diff['aka']['old'].split(';')):
        del(diff['aka'])
    else:
        aka_diff = {'+': [], '-': []}
        aka_list = list(set(diff['aka']['new'].split(';'))-set(diff['aka']['old'].split(';')))
        if aka_list and len(''.join(aka_list)) > 0:
            aka_diff['+'] = aka_list[:]  # slice to clone the list
        aka_list = list(set(diff['aka']['old'].split(';'))-set(diff['aka']['new'].split(';')))
        if aka_list and len(''.join(aka_list)) > 0:
            aka_diff['-'] = aka_list[:]  # slice to clone the list
        # handler can only deal with new akas
        if len(aka_diff['-']) == 0 and len(aka_diff['+']) > 0:
            diff[u'aka_list'] = {'new': aka_diff['+'], 'old': []}
            del(aka_diff['+'])
        # output remaining to log
        for k, v in aka_diff.iteritems():
            if len(v) > 0:
                log += u'difference in akas, diff%s: %s\n' % (k, ';'.join(v))
        # remove these for now
        del(diff['aka'])

    ## Post-processing
    # fotnot-namn without fotnot - needs to look up fotnot for o:cmt
    if wikiObj[u'fotnot-namn'] and not wikiObj[u'fotnot']:
        log += u'fotnot-namn so couldn\'t compare, fotnot-namn: %s\n' % wikiObj[u'fotnot-namn']
        if u'cmt' in diff.keys():
            del diff[u'cmt']

    # free defaults to unfree in wiki but not necessarily in db
    if 'free' in diff.keys() and diff['free']['new'] == '':
        if diff['free']['old'] == 'unfree':
            diff.pop('free')

    # Years which are not plain numbers cannot be sent to db
    if 'year' in diff.keys():
        if not common.is_int(diff['year']['new']):
            year = diff.pop('year')
            log += u'Non-integer year: %s\n' % year['new']

    # lat/lon requires an extra touch as only decimal numbers and Nones may be sent to db
    if 'lat' in diff.keys():
        if not diff['lat']['new']:
            # if new = None
            pass
        elif not common.is_number(diff['lat']['new']):
            lat = diff.pop('lat')
            log += u'Non-decimal lat: %s\n' % lat['new']
    if 'lon' in diff.keys():
        if not diff['lon']['new']:
            pass
        elif not common.is_number(diff['lon']['new']):
            lon = diff.pop('lon')  # pop first; diff['lon'] no longer exists afterwards
            log += u'Non-decimal lon: %s\n' % lon['new']

    # Basic validation of artist field:
    if 'artist' in diff.keys():
        # check that number of artists is the same
        if '[' in diff['artist']['old']:
            artist = diff.pop('artist')
            log += u'cannot deal with artists which include group affiliations: %s --> %s\n' % (artist['old'], artist['new'])
        elif (len(diff['artist']['old'].split(';')) != len(diff['artist']['new'].split(';'))) and (len(diff['artist']['old']) > 0):
            # if not the same number when there were originally some artists
            artist = diff.pop('artist')
            log += u'difference in number of artists: %s --> %s\n' % (artist['old'], artist['new'])

    # Unstripped references
    for k in diff.keys():
        if k in (u'official_url', u'inside', u'removed'):  # not strings or ok to have http
            continue
        if diff[k]['new'] and 'http' in diff[k]['new']:
            val = diff.pop(k)
            log += u'new value for %s seems to include a url: %s --> %s\n' % (k, val['old'], val['new'])

    return (diff, log)
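compareToDB depends on an unwiki helper that splits a wikitext parameter into plain text plus its link targets. It is defined elsewhere in the source repository; a minimal sketch of the assumed contract:

import re

def unwiki(wikitext):
    # Hypothetical sketch of unwiki as used by compareToDB: strip
    # [[target|label]] markup and return (plain_text, [link_targets]).
    links = []

    def _repl(m):
        links.append(m.group(1).strip())
        return m.group(2) or m.group(1)

    text = re.sub(r'\[\[([^|\]]+)(?:\|([^\]]+))?\]\]', _repl, wikitext or '')
    return text.strip(), links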
Example #39
def perform_semantic_inference(cluster_collection):
    """
    This function performs semantic inference on a given list of clusters.
    For each message in these clusters, semantics are inferred by analyzing each token
    resp. its context.

    At the moment only two semantics are inferred automatically: numeric and IPv4 address.

    TODO: Add more semantics, e.g. EOL identifier, length fields, ...
    """
    # Try to perform semantic inferences

    # Walk through every cluster and check messages for obvious results
    cluster = cluster_collection.get_all_cluster()
    for c in cluster:
        messages = c.get_messages()
        for message in messages:
            tokenlist = message.get_tokenlist()
            iterator = peekable(tokenlist)
            idx = 0
            while not iterator.isLast():
                #for tokenRepresentation in tokenlist:
                tokenRepresentation = iterator.next()
                # TODO: do we need to keep semantics which involve multiple cluster? e.g. sessionids?
                previous_semantics = tokenRepresentation.get_semantics()
                tokenRepresentation.set_semantics(
                    [])  # Clear existing semantics from previous run
                #for s in previous_semantics:
                #    if s.startswith("sessionid"):
                #        tokenRepresentation.add_semantic(s)
                #        break

                if "sessionid" in previous_semantics:
                    # Check if we have at least 2 messages and we are not of type Const
                    if len(messages) > 1 and c.get_format(
                            idx) != Message.typeConst:
                        tokenRepresentation.add_semantic("sessionid")
                if "FD" in previous_semantics:
                    tokenRepresentation.add_semantic("FD")

                token = tokenRepresentation.get_token()
                # Check whether it is numeric

                try:
                    isNumber = tokenRepresentation.get_tokenType(
                    ) == Message.typeText and common.is_number(token)
                except TypeError:
                    if Globals.getConfig().debug:
                        print "Error checking token {0} for number semantics".format(
                            token)
                    isNumber = False
                if isNumber:
                    tokenRepresentation.add_semantic("numeric")
                    #c.add_semantics(idx,"numeric")
                    #print "Inferred semantic inference 'numeric' for token ", token

                # Check whether it is an IP address
                if isinstance(token, str) and common.is_ipv4(token):
                    tokenRepresentation.add_semantic("ipv4 address")
                    # Do not add to cluster unless it is valid for all c.add_semantics(idx,"ipv4 address")
                    #print "Inferred semantic inference 'ipv4 address' for token ", token

                # Check for carriage return identifiers
                # When 0d is followed by 0a we've got a CR-LF
                # Sensible? When 0d or 0a is the last token, we've got a single CR resp LF
                # In all other cases assume 0d/0a is just a hex value of the protocol
                if token == 0xd:
                    nextOne = iterator.peek()
                    if isinstance(nextOne, TokenRepresentation):
                        if nextOne.get_token() == 0xa:
                            inferred_formats = c.get_format_inference()
                            if inferred_formats[idx].getType(
                            ) == Message.typeConst and inferred_formats[
                                    idx + 1].getType() == Message.typeConst:
                                tokenRepresentation.add_semantic("CR")
                                #c.add_semantics(idx,"CR")
                                nextOne = iterator.next()
                                nextOne.set_semantics(["LF"])
                                #c.add_semantics(idx+1, "LF")
                                idx += 1

                idx += 1
        # Perform other tests like "is length field?"
        # explicitly iterate through all messages as stated in the paper
        # we could also postpone this to the call of 'pushToClusterSemantics' but...

        reference_message = messages[0]
        tokenlist = reference_message.get_tokenlist()
        idx = 0
        for tokenRepresentation in tokenlist:
            if tokenRepresentation.get_tokenType(
            ) == Message.typeBinary and idx + 1 < len(tokenlist):
                ref_value = tokenRepresentation.get_token()
                if not tokenlist[idx + 1].get_tokenType(
                ) == Message.typeText:  # We require that the next token is the text token in question
                    idx += 1
                    continue
                ref_next_length = tokenlist[idx + 1].get_length()
                if not ref_value == ref_next_length:  # This is no length field
                    idx += 1
                    continue
                ref_message_length = reference_message.get_length()
                is_length = True
                for message in messages:
                    cmp_value = message.get_tokenlist()[idx].get_token()
                    cmp_next_length = message.get_tokenlist()[idx +
                                                              1].get_length()
                    cmp_message_length = message.get_length()
                    try:
                        diff_val = abs(cmp_value - ref_value)
                    except TypeError:  # Could happen if a short text token is mistaken as a binary value
                        break
                    diff_next_length = abs(cmp_next_length - ref_next_length)
                    # The next line also takes total msg length differences into account. This might not be true for
                    # all protocols
                    diff_msg_length = abs(cmp_message_length -
                                          ref_message_length)

                    if Globals.getConfig().requireTotalLengthChangeForLengthField:
                        if not (diff_val == diff_next_length == diff_msg_length):
                            is_length = False
                            break  # break only on a mismatch, keep checking other messages otherwise
                    else:
                        if not (diff_val == diff_next_length):
                            is_length = False
                            break

                if is_length:  # set "lengthfield" semantic for every message in the cluster at the given position
                    for message in messages:  # TODO: What if there's only one message in the cluster? Sensible?
                        message.get_tokenlist()[idx].add_semantic(
                            "lengthfield")
                        c.add_semantic_for_token(idx, "lengthfield")
            idx += 1

        # Try to identify sessionid fields

        reference_message = messages[0]
        nextInFlow = reference_message.getNextInFlow()
        if nextInFlow is not None and not (
                len(messages) == 1 and
                Globals.getConfig().sessionIDOnlyWithClustersWithMoreThanOneMessage):
            tokenlist = reference_message.get_tokenlist()
            next_tokenlist = nextInFlow.get_tokenlist()
            ref_idx = 0
            for tokenRepresentation in tokenlist:
                tokType = tokenRepresentation.get_tokenType()
                # If its not a binary, it cannot be a cookie
                if tokType != Message.typeBinary:
                    ref_idx += 1
                    continue
                fmt = c.get_format(ref_idx)
                # If its a binary but const, it cannot be a cookie
                if fmt[1] == Message.typeConst:
                    ref_idx += 1
                    continue
                # Set reference value
                ref_val = tokenRepresentation.get_token()
                # Walk next flow for reference value
                next_idx = 0
                for next_tokenRepresentation in next_tokenlist:
                    # Retrieve next token type
                    nextTokType = next_tokenRepresentation.get_tokenType()
                    # If it is not a binary we don't see it as a cookie
                    if Globals.getConfig().sessionIDOnlyWithBinary:
                        if nextTokType != Message.typeBinary:
                            next_idx += 1
                            continue
                    next_cluster = nextInFlow.getCluster()
                    # Get format of comparating message
                    comp_fmt = next_cluster.get_format(next_idx)
                    # If it is const, it cannot be a sessionid
                    if comp_fmt[1] == Message.typeConst:
                        next_idx += 1
                        continue
                    # Load comparator value
                    comp_val = next_tokenRepresentation.get_token()
                    if ref_val == comp_val:  # We've got a potential hit, now compare all messages for the same idx pairs
                        isCookie = True
                        for cmp_ref_msg in messages:
                            if not isCookie:
                                break
                            if cmp_ref_msg == messages[0]:  # Skip first message (we've already checked that one)
                                continue
                            cmp_ref_tok_list = cmp_ref_msg.get_tokenlist()

                            cmp_ref_val = cmp_ref_tok_list[ref_idx].get_token()
                            cmp_cmp_msg = cmp_ref_msg.getNextInFlow()
                            if cmp_cmp_msg is None:
                                isCookie = False
                            else:
                                cmp_cmp_tok_list = cmp_cmp_msg.get_tokenlist()
                                if next_idx >= len(cmp_cmp_tok_list):
                                    # Obviously "next" points to messages in different clusters
                                    # so the len might differ from the reference next cluster
                                    # used to find our reference cookie value
                                    # Therefore this cannot be a cookie
                                    isCookie = False
                                    continue
                                # Make sure the comparing token is also not constant
                                cmp_cmp_fmt = cmp_cmp_msg.getCluster(
                                ).get_format(next_idx)
                                # If it is const, it cannot be a sessionid
                                if cmp_cmp_fmt[1] == Message.typeConst:
                                    isCookie = False
                                    continue

                                # Finally compare the values
                                cmp_cmp_val = cmp_cmp_tok_list[
                                    next_idx].get_token()
                                if (cmp_ref_val != cmp_cmp_val) or (
                                    (cmp_ref_val == cmp_cmp_val) and
                                    (cmp_ref_val == ref_val)):
                                    isCookie = False
                        if isCookie:
                            # Set cookie semantic in this message and the other
                            #sessionid = uuid.uuid1()
                            for message in messages:  # Set for every message and the cluster itself
                                #message.get_tokenlist()[ref_idx].add_semantic("sessionid_{0}".format(sessionid))
                                message.get_tokenlist()[ref_idx].add_semantic(
                                    "sessionid")
                                nextMsg = message.getNextInFlow()
                                #nextMsg.get_tokenlist()[next_idx].add_semantic("sessionid_{0}".format(sessionid))
                                nextMsg.get_tokenlist()[next_idx].add_semantic(
                                    "sessionid")
                            c.add_semantic_for_token(ref_idx, "sessionid")
                            #c.add_semantic_for_token(ref_idx,"sessionid_{0}".format(sessionid))
                    next_idx += 1
                ref_idx += 1

        # Try to find random fields (16 bit)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType(
            ) == Message.typeVariable and rep == Message.typeBinary:
                try:
                    variance = c.getVariableStatistics()[idx].getVariance()
                except Exception:
                    variance = 0  # no statistics available; treat as non-random

                if variance > 1000 and len(semantics) == 0:
                    # We've got a very high variance and no assigned semantics --> candidate for random
                    # Have a look at the last but one token
                    if idx - 1 >= 0:
                        rep, form, semantics = token_formats[idx - 1]
                        if form.getType(
                        ) == Message.typeVariable and rep == Message.typeBinary:
                            stats = c.getVariableStatistics()[idx - 1]
                            if stats is not None:
                                variance2 = stats.getVariance()
                            else:
                                logging.error(
                                    "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})"
                                    .format(idx, len(token_formats),
                                            len(c.getVariableStatistics())))
                                idx += 1
                                continue

                            if variance2 > 1000 and len(semantics) == 0:
                                # Consider the two as a CRC-16
                                for message in messages:  # Set for every message and the cluster itself
                                    message.get_tokenlist()[
                                        idx - 1].add_semantic("random")
                                    message.get_tokenlist()[idx].add_semantic(
                                        "random")
                                c.add_semantic_for_token(idx - 1, "random")
                                c.add_semantic_for_token(idx, "random")
            idx += 1

        # Try to find sets (valued limited in variability with lower and upper bound)
        token_formats = c.get_formats()
        idx = 0
        for token_format in token_formats:
            rep, form, semantics = token_format
            if form.getType() == Message.typeVariable:
                stats = c.getVariableStatistics()[idx]
                if stats is not None:
                    distinct = stats.numberOfDistinctSamples()
                else:
                    logging.error(
                        "Did not receive cluster statistics for token {0} (len of formats {1}, len of stats {2})"
                        .format(idx, len(token_formats),
                                len(c.getVariableStatistics())))
                    idx += 1
                    continue
                # How will we find out whether a number of variable values is a set or really variable?
                # We assume that there is an absolute maximum number of distinct values which is independent
                # of the actual number of messages. However we also need to consider that when the number of
                # messages in a cluster definitely falls below the setAbsoluteMax value, we have to adapt
                # the local maximum in this cluster.
                # For the moment we take a multiplier for the number of messages (default 0.3 == 30%) and
                # assume it is a set when both setAbsoluteMax and the local threshold are underrun.
                # In addition we assume that we have no semantics for this token, as other semantics conflict
                # with the notion of a set.

                if (distinct <= Globals.getConfig().setAbsoluteMax
                        and distinct <=
                    (c.getNumberOfMessages() *
                     Globals.getConfig().setPercentageThreshold)
                        and len(semantics) == 0):
                    for message in messages:  # Set for every message and the cluster itself
                        message.get_tokenlist()[idx].add_semantic("set")
            c.add_semantic_for_token(idx, "set")  # token at idx, matching the per-message semantic above
            idx += 1
    # Push to cluster
    pushUpToCluster(cluster_collection)
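The IPv4 check above delegates to common.is_ipv4, which is not included in this listing. A plausible minimal version, assuming plain dotted-quad validation:

def is_ipv4(value):
    # Sketch of common.is_ipv4 as used above: four dot-separated decimal
    # octets in the range 0-255. The real helper may be stricter.
    parts = value.split('.')
    if len(parts) != 4:
        return False
    try:
        return all(0 <= int(p) <= 255 for p in parts)
    except ValueError:
        return False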
Example #41
def updatesToDatabase(odok, wiki, quick=False):
    '''
    Given a wiki entry that has been matched to an odok object,
    this checks whether any of the wiki info should be added to the odok
    object and prepares an update statement.
    Setting quick to True puts any updates requiring decision making into the postponed output.
    '''
    wpApi = wikiApi.WikiApi.setUpApi(user=dconfig.w_username,
                                     password=dconfig.w_password,
                                     site=dconfig.wp_site)
    updated = {}
    postponed = {}
    linked_artists = {}
    mapping = {
        u'namn': 'title',
        u'skulptör': 'artist',
        u'årtal': 'year',
        u'material': 'material',
        u'plats': 'address',
        u'header': 'district',
        u'lat': 'lat',
        u'lon': 'lon',
        u'bild': 'image',
        u'typ': 'type'
    }
    # non-trivial mappings u'namn_link':'wiki'
    for w in wiki:
        if not w['id']:
            continue
        o = odok[w['id']]
        changes = {}
        skipped = {}
        for k, v in mapping.iteritems():
            if k not in w.keys():  # for postponed file some fields might be missing
                continue
            no_Tags, dummy = common.extractLink(w[k], kill_tags=True)
            if not no_Tags:  # skip if w[k] is empty (or only a tag)
                continue
            if (not o[v]) and no_Tags:  # trivial case of new info
                changes[v] = no_Tags
            elif o[v] and (not o[v].lower() == no_Tags.lower()):
                if quick:
                    skipped[k] = w[k]
                else:
                    # need to decide which to use
                    print u'Diff for %s (%s): %s' % (w['id'], o['title'], v)
                    print u' ödok: "%s"' % o[v]
                    print u' wiki: "%s"' % w[k]
                    while True:
                        inChoice = raw_input(u'Use wiki [Y(es)/N(o)/S(kip)]:')
                        if inChoice.lower() == u'n' or inChoice.lower() == u'no':
                            break
                        elif inChoice.lower() == u'y' or inChoice.lower() == u'yes':
                            changes[v] = no_Tags
                            break
                        elif inChoice.lower() == u's' or inChoice.lower() == u'skip':
                            skipped[k] = w[k]
                            break

        # register any artist_links so that these can be compared to existing links
        if u'skulptör_link' in w.keys() and w[u'skulptör_link']:  # postponed might not have u'skulptör_link'
            for a in w[u'skulptör_link']:
                if a in linked_artists.keys():
                    linked_artists[a].append(w['id'])
                else:
                    linked_artists[a] = [w['id'], ]

        # article_links must be checked manually since link may be depictive rather than of the actual object.
        if (u'namn_link' in w.keys() and w['namn_link']) and not o[u'wiki']:  # postponed might not have u'namn_link'
            keys = w['namn_link']
            print u'Potential title link for "%s" ("%s" on wiki)' % (o['title'], w['namn'])
            for r in range(0, len(keys)):
                print u'%r: "%s"' % (r, keys[r])
            while True:
                inChoice = raw_input('Accept? [#/N]:')
                if inChoice == 'N' or inChoice == 'n':
                    break
                elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)):
                    # NEW START
                    wdInfo = wpApi.getPageInfo(keys[int(inChoice)],
                                               debug=True)[keys[int(inChoice)]]
                    if 'wikidata' in wdInfo.keys() and wdInfo['wikidata']:  # if exists and not empty
                        changes[u'wiki'] = wdInfo['wikidata']
                    break
        # add changes
        if changes:
            updated[w['id']] = changes.copy()
        if skipped:
            postponed[w['id']] = skipped.copy()
    # end of wiki_object loop

    # TODO: Build a new wikidata module.
    # TODO: Build an odok_write module in the same spirit, moving much of writeToDatabase into it.
    # if no header, try page
    # plats_link?
    return (updated, postponed, linked_artists)
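Both variants of updatesToDatabase assume that wpApi.getPageInfo returns a dict keyed on page title, with an optional 'wikidata' entry holding the item id when one exists. That shape is inferred from the call sites, not from the API's documentation; a hypothetical stub for offline testing:

def getPageInfo_stub(titles, debug=False):
    # Hypothetical stand-in for wpApi.getPageInfo returning the shape the
    # call sites above rely on; u'Q42' is a placeholder wikidata id.
    if isinstance(titles, basestring):  # Python 2, matching these examples
        titles = [titles]
    return dict((t, {'wikidata': u'Q42'}) for t in titles)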
Example #42
def findMatches(odok, wiki):
    '''
    tries to find matches between scraped items and existing odok items
    identified matches have the odok id added to the wiki object
    TODO: Expand to display several alternatives
    '''
    # collect any ids which have already been matched
    matched_ids = []
    for w in wiki:
        if w['id']:
            if w['id'] in matched_ids:
                print u'id %s was matched to more than one wiki object!' % w['id']
            else:
                matched_ids.append(w['id'])
    print u'%r out of %r already matched (out of a maximum of %r)' % (
        len(matched_ids), len(wiki), len(odok))

    # make lists of odok titles and artists
    odok_titles = {}
    odok_artist = {}
    odok_surname = {}
    for key, o in odok.iteritems():
        if key in matched_ids:
            continue
        if o['title']:
            if o['title'] in odok_titles.keys():
                odok_titles[o['title']].append(key)
            else:
                odok_titles[o['title']] = [key, ]
        if o['artist']:
            if o['artist'] in odok_artist.keys():
                odok_artist[o['artist']].append(key)
            else:
                odok_artist[o['artist']] = [key, ]
            surname = wash(o['artist'].split(' ')[-1])
            if surname in odok_surname.keys():
                odok_surname[surname].append(key)
            else:
                odok_surname[surname] = [key, ]

    # skip any wiki objects which have already been matched
    for w in wiki:
        if w['id']: continue
        wIdN = None
        wIdA = None
        wIdS = None
        match = ([], '')
        if w['namn'] in odok_titles.keys():
            wIdN = odok_titles[w['namn']]
        if w[u'skulptör'] in odok_artist.keys():
            wIdA = odok_artist[w[u'skulptör']]
        if wash(w[u'skulptör'].split(' ')[-1]) in odok_surname.keys():
            wIdS = odok_surname[wash(w[u'skulptör'].split(' ')[-1])]
        if wIdN and wIdA:  # match on both title and artist
            if len(wIdN) == 1:
                if wIdN[0] in wIdA:
                    match = ([wIdN[0]], 'double match')
                else:
                    match = ([wIdN[0]], 'title match but artist mismatch')
            else:
                for nId in wIdN:
                    if nId in wIdA:
                        match = ([nId], 'Non-unique title with artist match')
                        break
        elif wIdN:  # match on title only
            match = (wIdN, 'title match')
        elif wIdA:  # match on artist only
            match = (wIdA, 'artist match')
        elif wIdS:  # last-ditch attempt: match on surname
            match = (wIdS, 'surname match')
            # always check this if no match?
            # TODO: do a "nice search" with ss->s replacement
        # explicitly ask for verification for each match
        if match[0]:
            keys = match[0]
            print u'%s: (%s)' % (match[1], ' | '.join(keys))
            print u'W: "%s", "%s", (%s), "%s"' % (w[u'namn'], w[u'skulptör'], w[u'årtal'], w['plats'])
            for r in range(0, len(keys)):
                key = keys[r]
                print u'%r: "%s", "%s", (%s), "%s"' % (r, odok[key]['title'], odok[key][u'artist'], odok[key][u'year'], odok[key][u'address'])
            while True:
                inChoice = raw_input('Accept? [#/N]:')
                if inChoice == 'N' or inChoice == 'n':
                    break
                elif common.is_number(inChoice) and int(inChoice) in range(0, len(keys)):
                    w['id'] = keys[int(inChoice)]
                    break
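findMatches normalises surnames through a wash helper before comparing them; its definition is not included here. A minimal sketch of the assumed normalisation:

def wash(text):
    # Hypothetical sketch of the wash() helper used for surname matching:
    # lower-case and drop anything that is not alphanumeric, so that e.g.
    # u'Eriksson,' and u'eriksson' compare equal.
    return u''.join(ch for ch in text.lower() if ch.isalnum())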