Example #1
0
def xor_strings(a, b):
    """XOR two operands and return the result as an integer.

    Each operand may be an integer (as judged by ``isint``, converted with
    ``long_to_bytes``), a bytes-like object, or a text string. The shorter
    operand is stretched to the length of the longer one with
    ``extend_string``; both are then converted with ``bytes_to_long`` and
    XOR-ed.

    Operands that started out as bytes are round-tripped through latin-1 in
    both directions so that every byte value 0-255 survives unchanged.
    Re-encoding such data with the default UTF-8 codec would turn each byte
    >= 0x80 into two bytes and corrupt the result. Operands that were text
    to begin with are still encoded with the default codec.
    """
    if isint(a):
        a = long_to_bytes(a)
    if isint(b):
        b = long_to_bytes(b)

    # Lengths are taken before decoding, i.e. in bytes for byte operands.
    len_a = len(a)
    len_b = len(b)

    # Remember which operands were raw bytes so they can be re-encoded
    # with the same one-byte-per-character codec they were decoded with.
    a_was_bytes = hasattr(a, 'decode')
    b_was_bytes = hasattr(b, 'decode')
    if a_was_bytes:
        a = a.decode('latin-1')
    if b_was_bytes:
        b = b.decode('latin-1')

    if len_a > len_b:
        b = extend_string(b, len_a)
    elif len_b > len_a:
        a = extend_string(a, len_b)

    if hasattr(a, 'encode'):
        a = a.encode('latin-1') if a_was_bytes else a.encode()
    if hasattr(b, 'encode'):
        b = b.encode('latin-1') if b_was_bytes else b.encode()

    return bytes_to_long(a) ^ bytes_to_long(b)
Example #2
0
 def prepare_queryset_kwargs(self, field, value, negate):
     """Build the keyword arguments for a queryset filter on ``field``.

     A comma in ``value`` means several values were given: the value is
     split into a list and the operator is ``nin`` when negated, otherwise
     ``self.op``. A single value uses ``ne`` when negated and no operator
     otherwise. Values that ``isint`` accepts are converted to ``int``.

     Returns a one-item dict whose key is ``field`` joined to the operator
     with ``__`` (the operator is omitted when empty).
     """
     if ',' in value:
         value = value.split(',')
         op = 'nin' if negate else self.op
     else:
         op = 'ne' if negate else ''

     if type(value) is list:
         # Convert in place so the same list object is kept.
         for position, item in enumerate(value):
             if isint(item):
                 value[position] = int(item)
     elif isint(value):
         value = int(value)

     lookup = '__'.join(part for part in (field, op) if part)
     return {lookup: value}
Example #3
0
def encode(s, **kwargs):
    """XOR ``s`` with the ``key`` keyword argument and format the result.

    Both ``s`` and ``key`` are strings; either one may start with ``0x``,
    in which case it is parsed as a hexadecimal integer. When both end up
    as integers (per ``isint``) they are combined with ``xor_int``,
    otherwise with ``xor_strings``.

    Returns ``None`` (after printing a message) when ``key`` is missing or
    empty; otherwise returns whatever ``format_result`` produces from the
    integer result, its hex form and its latin-1 decoded bytes.
    """
    # .get() so that an omitted key is reported instead of raising KeyError.
    key = kwargs.get('key')
    if not key:
        print("Key not provided")
        return None
    if s.startswith('0x'):
        s = int(s[2:], 16)
    if key.startswith('0x'):
        key = int(key[2:], 16)
    if isint(s) and isint(key):
        result = xor_int(s, key)
    else:
        result = xor_strings(s, key)
    return format_result(result, hex(result),
                         long_to_bytes(result).decode('latin-1'))
Example #4
0
def encode(s, **kwargs):
    """Return the Base64 text form of ``s``.

    A string starting with ``0x`` is first parsed as a hexadecimal integer.
    Values that ``isint`` accepts are turned into bytes with
    ``long_to_bytes``; anything else is encoded with ``str.encode()``.
    Extra keyword arguments are accepted and ignored.
    """
    value = int(s[2:], 16) if s.startswith('0x') else s
    raw = long_to_bytes(value) if isint(value) else value.encode()
    return b64encode(raw).decode()
Example #5
0
 def parse_listkey(self, arg):
     """Turn a command argument into a list key.

     An integer-like ``arg`` (per ``isint``) is treated as a day offset
     and added to the number of whole days between 1970-01-01 and
     ``datetime.now()``. Any other argument is returned stripped of
     surrounding whitespace.
     """
     if not isint(arg):
         return arg.strip()

     offset = int(arg)
     days_since_epoch = (datetime.now() - datetime(1970, 1, 1)).days
     return offset + days_since_epoch
Example #6
0
def test_utils_all():
    """Call three helpers from ``utils`` and print what each returns.

    The helpers are invoked one at a time, each result being printed
    before the next helper runs.
    """
    from utils import isint, set_at, get_int_input

    isint_result = isint([5])
    print("testing: isint:", isint_result)

    set_at_result = set_at("ABS!", 2, "C")
    print("testing: set_at:", set_at_result)

    int_input = get_int_input(prompt="GIVE ME IINNTT!!",
                              error_message="F**K YOU, INT I TOLD!")
    print("testing: get_int_input:", int_input)
Example #7
0
 def __getitem__(self, daykey):
     """Return the list stored under ``daykey``.

     Keys present in ``self.extra_lists`` are looked up there. Any other
     key must be integer-like (per ``isint``); it is converted to ``int``
     and looked up in ``self.days``, creating an empty list for that day
     on first access.
     """
     assert isint(daykey) or daykey in self.extra_lists
     if daykey in self.extra_lists:
         return self.extra_lists[daykey]

     day = int(daykey)
     if day not in self.days:
         self.days[day] = []
     return self.days[day]
Example #8
0
def decode(ciphertext, **kwargs):
    """Decode digit-pair tokens through the module-level ``table``.

    ``ciphertext`` is split on the ``key`` keyword argument when one is
    given and non-empty, otherwise on whitespace. Dashes inside a token
    longer than one character are removed. A token that ``isint`` accepts
    is decoded as ``table[first_digit - 1][second_digit - 1]``; every
    other token is copied to the output unchanged.

    Tokens shorter than two characters are also copied unchanged, because
    they cannot supply both a row and a column digit.

    Returns the decoded pieces joined into one string.
    """
    # .get() so that omitting ``key`` selects whitespace splitting instead
    # of raising KeyError.
    separator = kwargs.get('key')
    tokens = ciphertext.split(separator) if separator else ciphertext.split()

    plaintext = []
    for token in tokens:
        if '-' in token and len(token) > 1:
            token = token.replace('-', '')
        if len(token) < 2 or not isint(token):
            plaintext.append(token)
            continue
        row = int(token[0]) - 1
        column = int(token[1]) - 1
        plaintext.append(table[row][column])
    return "".join(plaintext)
Example #9
0
 def prepare_queryset_kwargs(self, field, value, negate):
     """
     Build the raw query used to search inside a list of dictionaries.

     ``value`` has the form ``<dict_field>.<values>``, where ``<values>``
     is either a single value or a comma-separated list (empty items are
     dropped). Items that ``isint`` accepts are converted to ``int``.

     Returns a dict with a single ``__raw__`` entry that matches
     ``<field>.<dict_field>`` against the values with ``$in``.
     ``negate`` is accepted but not used by this implementation.
     """
     dict_field, vals = value.split('.')

     if ',' in vals:
         candidates = [item for item in vals.split(',') if item]
     else:
         candidates = [vals]

     matches = [int(item) if isint(item) else item for item in candidates]
     return dict(__raw__={field + '.' + dict_field: {"$in": matches}})
Example #10
0
 def on_btn_ok(self, sender):
     """Split ``self.subs`` into the selected number of part files.

     The part count is read from ``self.combo``. When it is not
     integer-like (per ``isint``) or is smaller than one, the dialog is
     closed without writing anything. Otherwise one ``cfile`` per part is
     derived from ``self.filename`` (base name suffixed with
     `` - part NN`` and placed in a ``split`` subdirectory), and the
     subtitles are written out in consecutive chunks of
     ``ceildiv(len(self.subs), parts)`` entries.
     """
     pnum = self.combo.get_active_text()
     if not isint(pnum):
         self.close()
         return

     part_count = int(pnum)
     if part_count < 1:
         # No parts would be created, so files[0] below would raise
         # IndexError; treat it like any other unusable selection.
         self.close()
         return

     files = [cfile(self.filename) for _ in range(part_count)]
     for number, f in enumerate(files, start=1):
         f.change_base(f.base + ' - part ' + str(number).zfill(2))
         f.add_subdir('split')

     # Only the first file's path is checked and created here.
     if not files[0].path_exists:
         files[0].create_path()

     subs_per_part = ceildiv(len(self.subs), part_count)
     for idx, f in enumerate(files):
         start = idx * subs_per_part
         partfile = srtFile(f.full_path)
         partfile.write_to_file(self.subs[start:start + subs_per_part])
     self.close()
Example #11
0
    def compute(self, aitem: Union[int, PeakEventsTuple]) -> FitBead:
        """Process one item and return its ``FitBead`` result.

        ``aitem`` is either a bead key (per ``isint``), in which case the
        peak events are read from ``self.data``, or an already paired
        ``(bead, events)`` tuple.
        """
        # Re-resolve the configuration whenever the track path differs from
        # the one the configuration was last resolved for.
        resolved_for = getattr(self, '_resolved', None)
        track_path = getattr(self.track, 'path', None)
        if resolved_for != track_path:
            self.config = self.config.resolve(self.track.path)
            self._resolved = self.track.path

        if not isint(aitem):
            bead, inp = cast(PeakEventsTuple, aitem)
        else:
            bead = cast(int, aitem)
            inp = cast(PeakEvents, cast(dict, self.data)[bead])

        events = self.__topeaks(inp)
        baseline = self.__baseline(bead, inp)
        singlestrand = self.__singlestrand(bead, inp)

        has_baseline = baseline is not None
        has_singlestrand = singlestrand is not None
        dist = self.distances(bead, events, has_baseline, has_singlestrand)
        return self.__beadoutput(bead, events, dist, (baseline, singlestrand))
Example #12
0
def memory_str(memory, p=-1, executing_char='', sep="|", marker='', s1=2, s2=3, mode="", filter_0=True):
    """Render the cells of ``memory`` as one row of text.

    Every cell is formatted with the external helper ``f`` and followed by
    ``sep``. The cell at index ``p % len(memory)`` is rendered as
    ``marker + executing_char`` (width ``s1``) followed by the cell value
    (width ``s2``). Other cells take the full ``s1 + s2`` width, or are
    left blank when they equal 0, ``filter_0`` is set and
    ``executing_char`` is not ``'o'``.

    ``mode`` selects how a cell is displayed: ``"int"`` converts
    integer-like cells (per ``isint``) to ``int``; ``"bool"`` shows
    boolean cells as ``T``/``F``; ``"boolean"`` shows every cell as
    ``T``/``F``.
    """
    pieces = []
    for index, raw in enumerate(memory):
        shown = raw
        if mode == "int" and isint(raw):
            shown = int(raw)
        elif (mode == "bool" and type(raw) == type(True)) or mode == "boolean":
            shown = "F" if raw == False else "T"

        if index == p % len(memory):
            pieces.append(f(marker + executing_char, s1, s1) + f(str(shown), s2, s2) + sep)
        elif raw == 0 and filter_0 and not executing_char == 'o':
            pieces.append(" " * (s1 + s2) + sep)
        else:
            pieces.append(f(shown, s1 + s2, s1 + s2) + sep)
    return "".join(pieces)
Example #13
0
def restore_or_init(net, logger, dest_dir, args):
    """Restore ``net`` and ``logger`` from a checkpoint, or start fresh.

    ``args.restore`` is either integer-like (per ``utils.isint``), meaning
    an iteration to restore from ``dest_dir`` itself, or a path that
    ``utils.parent_dir`` splits into a directory and an iteration. A
    relative directory is resolved against the parent of ``dest_dir``.

    An iteration of ``-1`` selects the last entry returned by
    ``utils.get_saves``. Training starts from scratch when ``-1`` is
    requested from ``dest_dir`` and nothing is saved, or when iteration
    ``0`` is requested and no such checkpoint exists.

    Returns the iteration to start from (``0`` when starting from scratch).

    Raises:
        ValueError: if the requested checkpoint cannot be found.
    """
    restore_fromthis = bool(utils.isint(args.restore))
    if restore_fromthis:
        restore_from, restore_iter = dest_dir, args.restore
    else:
        restore_from, restore_iter = utils.parent_dir(args.restore)
        if not osp.isabs(restore_from):
            restore_from = osp.join(utils.parent_dir(dest_dir)[0], restore_from)

    saved = utils.get_saves(restore_from)
    restore_iter = int(restore_iter)

    # ``checkpoint`` stays None when training has to start from scratch.
    checkpoint = None
    if restore_iter == -1:
        if saved:
            checkpoint = saved[-1]
        elif not restore_fromthis:
            raise ValueError('No checkpoints found in {}'.format(restore_from))
    else:
        checkpoint = next(
            ((it, directory) for it, directory in saved if it == restore_iter),
            None)
        if checkpoint is None and restore_iter != 0:
            raise ValueError('Checkpoint {} not found in {}'.format(restore_iter, restore_from))

    if checkpoint is None:
        return 0

    start_iter, iter_dir = checkpoint
    snap_dest = osp.join(iter_dir, 'state_dict.pth')
    print("Restoring net and logger state from", snap_dest)
    # Map to CPU in case the optimisation was done with different devices.
    saved_state_dict = torch.load(snap_dest, map_location=lambda storage, loc: storage)
    # NOTE(review): the attribute check presumably tells a plain
    # OrderedDict of weights from another saved format - confirm.
    if hasattr(saved_state_dict, '_OrderedDict__root'):
        load_weights(net, saved_state_dict)
    else:
        net.initialize_from_file(snap_dest)
    logger.restore(iter_dir)
    return start_iter
Example #14
0
def test_hypothesis(**kwargs):
    """
    Test a hypothesis on examples. How to call:
    -  With no keyword arguments, the hypothesis is checked on
       gl.current_example (normal mode).
    -  With example=i, where i is an integer, example i is fetched with
       utils.get_example and checked; the previously current example is
       passed along as ``last_seen`` (debugging mode).
    -  With example='all' (or any other non-integer value), the hypothesis
       is checked on all seen examples (debugging mode).
    Any other keyword arguments yield None.
    """
    import utils
    previous_example = gl.current_example  # kept so it can be handed back later
    if not kwargs:
        return test_default()
    if 'example' not in kwargs:
        return None

    requested = kwargs['example']
    if not utils.isint(requested):
        # not an integer, so every seen example is checked (argument = 'all')
        return test_all(previous_example)

    # a specific example was requested (for debugging)
    utils.get_example(requested)
    return test_default(last_seen=previous_example)
Example #15
0
def test_hypothesis(**kwargs):
    """
    Test a hypothesis on examples. How to call:
    -  With no keyword arguments, the hypothesis is checked on
       gl.current_example (normal mode).
    -  With example=i, where i is an integer, example i is fetched with
       utils.get_example and checked; the previously current example is
       passed along as ``last_seen`` (debugging mode).
    -  With example='all' (or any other non-integer value), the hypothesis
       is checked on all seen examples (debugging mode).
    Any other keyword arguments yield None.
    """
    import utils
    previous_example = gl.current_example  # kept so it can be handed back later
    if not kwargs:
        return test_default()
    if 'example' not in kwargs:
        return None

    requested = kwargs['example']
    if not utils.isint(requested):
        # not an integer, so every seen example is checked (argument = 'all')
        return test_all(previous_example)

    # a specific example was requested (for debugging)
    utils.get_example(requested)
    return test_default(last_seen=previous_example)
Example #16
0
    def fixfootnoterefs(self):
        '''fixfootnoterefs(self): Fix the contents for a known pattern.

        Sometimes footnote references appear on their separate line above the
        line they should be on:

                       123)
            lorem ipsum

        instead of:

            lorem ipsum123)

        This causes the footnote reference to be considered as code, and so
        is the following line. Each such Code element that directly follows
        a Text element is folded into that Text element (second line first,
        then the stripped reference, then the remaining lines) and removed.'''
        merged = []
        for idx in range(1, len(self.elements)):
            code = self.elements[idx]
            if not isinstance(code, elements.Code):
                continue
            if len(code.lines) < 2:
                continue
            reference = code.lines[0]
            # The first line must look like "<integer>)".
            if not (utils.isint(reference[:-1]) and reference[-1] == ')'):
                continue
            prevtext = self.elements[idx - 1]
            if not isinstance(prevtext, elements.Text):
                continue

            prevtext.addcontent(code.lines[1] + reference.lstrip())
            for line in code.lines[2:]:
                prevtext.addcontent(line)
            merged.append(idx)

        # Delete from the end so the remaining indices stay valid.
        for idx in reversed(merged):
            del self.elements[idx]
Example #17
0
def plot_feature_contributions_surgery_class(X,
                                             y,
                                             feature_index,
                                             fcs,
                                             attributes,
                                             class_of_interest,
                                             title=None):
    """Scatter-plot one feature's contributions, split by surgery status.

    For every row ``i`` of ``X`` whose ``fcs[i]`` has an entry for
    ``feature_index``, the contribution
    ``fcs[i][feature_index][class_of_interest]`` is drawn on the x axis
    against the feature value on the y axis. Points are grouped by the
    ``Q44071_snCplexoAt`` attribute: ``'S'``/``'Y'`` (circle), NaN
    (diamond) and anything else (cross). A point is blue when
    ``y[i] == class_of_interest`` and red otherwise.

    The feature is treated as numeric when its first non-NaN value is
    accepted by ``utils.isint`` or ``utils.isfloat``; otherwise it is
    categorical and the y axis shows the position of each value in the
    list of distinct values.

    Args:
        X: 2-D array of instances (indexed as ``X[:, j]`` and ``X[i][j]``).
        y: class of each instance.
        feature_index: column of the feature to plot.
        fcs: per-instance mapping of feature index to per-class contributions.
        attributes: attribute names; compared element-wise with ``==``,
            so presumably a numpy array - TODO confirm.
        class_of_interest: class whose contributions are plotted.
        title: optional output path. When given, the figure is saved there
            and a text dump of the plotted data is then written to the
            same path.
    """
    surgery_index = np.where(attributes == 'Q44071_snCplexoAt')[0][0]

    first_value = X[utils.firstNotNan(X[:, feature_index])][feature_index]
    is_numeric = utils.isint(first_value) or utils.isfloat(first_value)

    if not is_numeric:
        # Categorical feature: y positions are indices into ``values``,
        # with a trailing NaN entry reserved for the unknown-surgery group.
        values = [v for v in set(X[:, feature_index])
                  if not utils.isnan(v)] + [np.nan]

        surgery, no_surgery, unknown = _split_points_by_surgery(
            X, y, fcs, feature_index, surgery_index, class_of_interest,
            y_position=values.index, nan_position=len(values) - 1)
        x_surgery, y_surgery, surgery_colors = surgery
        x_no_surgery, y_no_surgery, no_surgery_colors = no_surgery
        x_nan, y_nan, nan_colors = unknown

        ax = plt.subplot(111)
        ax.scatter(x_surgery,
                   y_surgery,
                   marker='o',
                   s=60,
                   edgecolors=surgery_colors,
                   facecolors='none')
        ax.scatter(x_no_surgery,
                   y_no_surgery,
                   marker='x',
                   s=60,
                   edgecolors=no_surgery_colors,
                   facecolors='none')
        ax.scatter(x_nan,
                   y_nan,
                   marker='d',
                   s=60,
                   edgecolors=nan_colors,
                   facecolors='none')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        # One empty tick below and above the category ticks.
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
    else:
        # Numeric feature: y positions are the raw values; the
        # unknown-surgery group is drawn one step above the largest value.
        values = sorted([
            round(v, 4) for v in set(X[:, feature_index])
            if not utils.isnan(v)
        ])
        print(values)
        # Needs at least two distinct non-NaN values.
        nan_index = values[-1] - values[-2]

        surgery, no_surgery, unknown = _split_points_by_surgery(
            X, y, fcs, feature_index, surgery_index, class_of_interest,
            y_position=lambda value: value,
            nan_position=values[-1] + nan_index)
        x_surgery, y_surgery, surgery_colors = surgery
        x_no_surgery, y_no_surgery, no_surgery_colors = no_surgery
        x_nan, y_nan, nan_colors = unknown

        fig, ax = plt.subplots()
        ax.scatter(x_surgery,
                   y_surgery,
                   marker='o',
                   s=60,
                   facecolors='none',
                   edgecolors=surgery_colors)
        ax.scatter(x_no_surgery,
                   y_no_surgery,
                   marker='x',
                   s=60,
                   edgecolors=no_surgery_colors)
        ax.scatter(x_nan,
                   y_nan,
                   marker='d',
                   s=60,
                   facecolors='none',
                   edgecolors=nan_colors)
        # Draw once so the automatic tick labels exist before reading them.
        fig.canvas.draw()
        labels = [''] + [item.get_text()
                         for item in ax.get_yticklabels()] + ['']
        if values[-1] + nan_index < ax.get_yticks()[-1]:
            plt.yticks(
                [values[0] - nan_index] +
                sorted(list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) +
                [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        labels[-2] = 'nan'

        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)

    _add_surgery_legend(str(class_of_interest))
    plt.show()

    if title is not None:
        # NOTE(review): the figure is saved after plt.show(), and the text
        # dump below is written to the same path, so it replaces the saved
        # figure - confirm this is intended.
        plt.savefig(title)
        plt.close()
        dump = [
            ('X', X),
            ('y', y),
            ('fcs', fcs),
            ('features', attributes),
            ('feature_index', feature_index),
            ('values', values),
            ('x_surgery', x_surgery),
            ('y_surgery', y_surgery),
            ('surgery_colors', surgery_colors),
            ('x_no_surgery', x_no_surgery),
            ('y_no_surgery', y_no_surgery),
            ('no_surgery_colors', no_surgery_colors),
            ('x_nan', x_nan),
            ('y_nan', y_nan),
            ('nan_colors', nan_colors),
        ]
        # ``with`` guarantees the file is flushed and closed.
        with open(title, 'w') as f:
            f.write('\n'.join(name + '=' + str(value)
                              for name, value in dump))


def _split_points_by_surgery(X, y, fcs, feature_index, surgery_index,
                             class_of_interest, y_position, nan_position):
    """Group the plotted points into surgery / no-surgery / unknown triples.

    Each returned triple is ``(xs, ys, colors)``. ``y_position`` maps a
    feature value to its y coordinate; rows whose surgery attribute is NaN
    are placed at ``nan_position`` instead.
    """
    surgery = ([], [], [])
    no_surgery = ([], [], [])
    unknown = ([], [], [])
    for i in range(X.shape[0]):
        if feature_index not in fcs[i]:
            continue
        flag = X[i][surgery_index]
        if flag == 'S' or flag == 'Y':
            xs, ys, colors = surgery
        elif utils.isnan(flag):
            xs, ys, colors = unknown
        else:
            xs, ys, colors = no_surgery

        xs.append(fcs[i][feature_index][class_of_interest])
        if ys is unknown[1]:
            # Unknown-surgery rows get a fixed position; their feature
            # value is not looked up.
            ys.append(nan_position)
        else:
            ys.append(y_position(X[i][feature_index]))
        colors.append('blue' if y[i] == class_of_interest else 'red')
    return surgery, no_surgery, unknown


def _add_surgery_legend(coi):
    """Add the colour/marker legend shared by both plot variants."""
    red_patch = mpatches.Patch(color='red')
    blue_patch = mpatches.Patch(color='blue')
    xmarker = mlines.Line2D([], [],
                            color='black',
                            marker='x',
                            markersize=10,
                            linestyle='None')
    omarker = mlines.Line2D([], [],
                            color='black',
                            marker='o',
                            markersize=10,
                            linestyle='None',
                            markerfacecolor='None',
                            markeredgecolor='black')
    plt.legend([red_patch, blue_patch, xmarker, omarker], [
        'Classe da instância ≠ ' + coi, 'Classe da instância = ' + coi,
        'Não passou por cirurgia', 'Passou por cirurgia'
    ],
               numpoints=1,
               fontsize='small')
Example #18
0
def handle_text(message):
    """Handle one incoming text message: admin menu actions and survey flow.

    Administrators (ids in ``ADMINS``) can list the existing questions,
    delete one by sending its number, add a new one, or cancel the current
    admin action. ``INADMINMENU`` stores which admin action each user is
    currently in.

    For the survey, answering "да" sends the first question; every later
    message is recorded as an answer in the user's ``answs`` list and the
    next question is sent. After the last answer the results go to
    ``send_to_admins`` and the user's state is reset. "нет" shows the
    start prompt.
    """
    uid = message.from_user.id
    if uid not in USERS:
        USERS[uid] = u.User()
    text = message.text

    if text == "Отмена" and uid in ADMINS:
        if uid in INADMINMENU:
            if INADMINMENU[uid] != '':
                # Leave the current admin action and show the menu again.
                INADMINMENU[uid] = ''
                _send_admin_menu(uid)
                return
            bot.send_message(uid, "Чтобы начать "
                                  "опрос введите команду /start", reply_markup=u.get_keyboard([]))
        return

    if text == "Существующие вопросы" and uid in ADMINS:
        print(Questions)
        pieces = ["Текущие вопросы в боте: \n\n"]
        for number, q in enumerate(Questions):
            pieces.append("(№ {}) ".format(number))
            pieces.append('{} \n Ответы: {}\n\n'.format(q.text, ", ".join(q.answers)))
        pieces.append("Для удаления вопроса отправьте его номер.")
        msg = "".join(pieces)
        INADMINMENU[uid] = "Существующие вопросы"
        markup = u.get_keyboard(["Отмена"])
        print(msg)
        bot.send_message(uid, msg, reply_markup=markup)
        return

    if text == "Добавить вопрос" and uid in ADMINS:
        INADMINMENU[uid] = "Добавить вопрос"
        msg = "Для добавления вопроса введите текст нового вопроса, " \
              "затем в скобках варианты через запятую, если требуется. \n\n" \
              "Пример: Введите ваш возраст (12 лет, 21 год, 45, более 50-ти)\n\n" \
              "(строгие требования к написанию вопроса относятся лишь к существующему " \
              "прототипу и в дальнейшем ввод вопросов будет упрощен)"
        markup = u.get_keyboard(["Отмена"])
        print(msg)
        bot.send_message(uid, msg, reply_markup=markup)
        return

    if uid in INADMINMENU:
        if INADMINMENU[uid] == "Существующие вопросы":
            # Delete by position, and only when the number is a valid index;
            # an out-of-range or negative number gets the usual hint.
            index = int(text) if u.isint(text) else -1
            if 0 <= index < len(Questions):
                del Questions[index]
                bot.send_message(uid, "Вопрос удален")
                _send_admin_menu(uid)
            else:
                bot.send_message(uid, "Для удаления вопроса отправьте его номер.")
            return

        if INADMINMENU[uid] == "Добавить вопрос":
            try:
                # Expected form: "<question> (<answer>, <answer>, ...)".
                parts = text.split(' (')
                question_text = parts[0]
                answers = parts[1][:-1].split(', ') if len(parts) > 1 else []
                if answers and len(answers[0]) > 0:
                    Questions.append(u.Question(question_text, answers))
                else:
                    Questions.append(u.Question(question_text))
                markup = u.get_keyboard(["/start"])
                bot.send_message(uid, "Вопрос добавлен", reply_markup=markup)
                INADMINMENU[uid] = ""
            except Exception:
                bot.send_message(uid, "Пожалуйста следуйте требованиям при написании вопроса "
                                      "(строгие требования относятся лишь к существующему "
                                      "прототипу и в дальнейшем ввод вопросов будет упрощен)")
            # The message was an admin command, not a survey answer, so it
            # must not reach the survey handling below.
            return

    user = USERS[uid]

    if text.lower() == "да":
        if len(Questions) > 0:
            user.question = Questions[0]
            markup = u.get_keyboard(user.question.answers)
            bot.send_message(uid, user.question.text, reply_markup=markup)
        else:
            markup = u.get_keyboard([])
            bot.send_message(uid, "В боте еще не заданы вопросы", reply_markup=markup)
        # ``question`` always holds the question to send next.
        if len(Questions) > 1:
            user.question = Questions[1]
            user.q_index = 1
        else:
            user.is_last_quest = True
        return

    if not user.is_last_quest and user.question is not None:
        user.answs.append(text)
        markup = u.get_keyboard(user.question.answers)
        bot.send_message(uid, user.question.text, reply_markup=markup)
        if len(Questions) > user.q_index + 1:
            user.q_index += 1
            user.question = Questions[user.q_index]
        else:
            user.is_last_quest = True
        return

    if user.is_last_quest:
        user.answs.append(text)
        bot.send_message(uid, "Спасибо, за пройденный опрос", reply_markup=u.get_keyboard([]))
        send_to_admins(user)
        USERS[uid] = u.User()

    if text.lower() == "нет":
        markup = u.get_keyboard(["/start"])
        bot.send_message(uid, "Нажмите на кнопку старт чтоб начать "
                              "опрос или введите команду /start", reply_markup=markup)
        return


def _send_admin_menu(uid):
    """Send the administrator menu keyboard to chat ``uid``."""
    markup = u.get_keyboard(["Существующие вопросы", "Добавить вопрос", "Отмена"])
    bot.send_message(uid, "Меню администратора \n"
                          "(Визуальное представление меню, "
                          "логика и способы взаимодействия c ботом являются "
                          "демо-вариантами и могут быть изменены)", reply_markup=markup)
Example #19
0
def readData(class_name,
             class_questionnaire='Q92510',
             data_path=None,
             missing_input='none',
             dummy=False,
             transform_numeric=False,
             use_text=False,
             skip_class_questionnaire=True):
    """Read a CSV file and convert its columns into a numeric-friendly DataFrame.

    Each remaining column is classified into one of the treatments
    'date', 'text', 'float', 'int' or 'bin' and transformed accordingly;
    the transformed columns are then reassembled into a new DataFrame.

    Args:
        class_name: substring identifying the class (target) column(s); any
            column whose name contains it is factorized and treated as 'int'.
        class_questionnaire: substring identifying the questionnaire the class
            belongs to. When ``skip_class_questionnaire`` is true, columns
            containing this substring but not ``class_name`` are skipped.
        data_path: path handed to ``pd.read_csv``. The default ``None`` is not
            handled specially here, so a real path is presumably required.
        missing_input: imputation strategy passed to ``preprocessing.Imputer``;
            the literal string 'none' disables imputation.
        dummy: when true (and ``transform_numeric`` is true), non-float,
            non-date, non-text columns are one-hot encoded ('bin').
            It is forced to ``False`` whenever ``transform_numeric`` is false.
        transform_numeric: when true, 'int' columns use the factorized codes
            instead of the raw values.
        use_text: when true, 'text' columns are expanded into bag-of-words
            count columns; otherwise they are dropped.
        skip_class_questionnaire: see ``class_questionnaire``.

    Returns:
        A ``pandas.DataFrame`` whose columns are the transformed attributes.

    Note:
        Calls ``exit(-1)`` if a non-string column is neither float nor int
        according to ``utils.isfloat`` / ``utils.isint``.
        NOTE(review): ``preprocessing`` is presumably ``sklearn.preprocessing``;
        ``Imputer`` only exists in older scikit-learn releases -- confirm the
        pinned version.
    """
    # attributes are separated by commas (',')
    # "nan" is assigned to fields with 'N/A' or 'None'

    print('Reading data...')
    data = pd.read_csv(data_path,
                       header=0,
                       delimiter=",",
                       na_values=['N/A', 'None', 'nan', 'NAAI', 'NINA'],
                       quoting=0,
                       encoding='utf8')

    # one-hot encoding is only honoured together with transform_numeric
    if (not transform_numeric):
        dummy = False

    #print(data.columns[-1])
    # data = data.dropna(subset=[class_name])
    # data = data.drop(np.where([e == 'NAAI' or e == 'NINA' for e in data[data.columns[-1]]])[0])
    # print(data.shape)
    # drop bookkeeping columns, selected by the suffix of their names
    data = data.drop(data.columns[data.columns.str.endswith('id')], 1)
    data = data.drop(data.columns[data.columns.str.endswith('token')], 1)
    data = (data.drop(data.columns[data.columns.str.endswith('ipaddr')], 1))
    data = (data.drop(data.columns[data.columns.str.endswith('date')], 1))
    data = (data.drop(data.columns[data.columns.str.endswith('stamp')], 1))
    #data = (data.drop(data.columns[data.columns.str.endswith('participant_code')],1))
    data = (data.drop(data.columns[data.columns.str.endswith('datLesao')], 1))
    data = (data.drop(data.columns[data.columns.str.endswith('datNasc')], 1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Origem')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Cidade')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Estado')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('País')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('participant_code')],1))
    # data = (data.drop(data.columns[data.columns.str.endswith('Natural de')],1))

    #data = ((((data.T).drop_duplicates(keep='first')).dropna(how='all')).T)
    #dropping columns that are constant
    #data = data.loc[:,data.apply(pd.Series.nunique) != 1]

    ## data = pp.preprocess(data_path,class_name)
    n_samples = data.shape[0]
    n_features = data.shape[1]
    # matches "YYYY-MM-DD" optionally followed by "HH:MM:SS"
    regex_date = re.compile(
        '(\d{4})-(\d{2})-(\d{2})\s?((\d{2}):(\d{2}):(\d{2}))?')

    # treatment[k] holds the treatment label chosen for the k-th column
    treatment = np.empty(n_features, dtype='U5')

    attributes = []  # names of the output columns
    categories = []  # factorize() uniques per kept column (not returned)
    transformedData = []  # one (n_samples, 1) column per output attribute
    index = 0  # position of the current column; must advance on every path
    si = 0  # not used anywhere below

    print('Transforming data...')
    ### representing the categories with numbers

    for attribute in data.columns:

        # skip the other questions of the class questionnaire
        if skip_class_questionnaire and class_questionnaire is not None and class_questionnaire in attribute and class_name not in attribute:
            index += 1
            continue
        # else:
        #   if class_questionnaire in attribute and 'Ombro' not in attribute:
        #       index+=1
        #       continue

        # t[0]: integer codes, t[1]: sorted unique values
        t = pd.factorize(data[attribute].values, sort=True)

        #temp = t[0]

        # presumably the position of the first non-NaN entry -- see utils
        i = utils.firstNotNan(data[attribute].values)

        try:
            # regex matching raises TypeError for non-string values; that case
            # is handled by the except branch below
            result = regex_date.match(data[attribute].values[i])
            if (result):
                treatment[index] = 'date'

            # almost every row distinct and the first unique value is long
            elif (len(t[1]) > 0.9 * n_samples and len(t[1][0]) > 50):
                # if(attribute == 'participant_code'):
                #   temp = t[0]
                #   treatment[index] = 'int'
                # else:
                treatment[index] = 'text'
            else:
                if (utils.isfloat(data[attribute].values[i])):
                    # index+=1
                    # continue
                    #temp = [float(x) for x in t[0]]
                    treatment[index] = 'float'

                elif (not dummy):

                    if (transform_numeric
                            or utils.isint(data[attribute].values[i])):
                        temp = t[0]
                        # if not utils.isint(data[attribute].values[i]):
                        #   index += 1
                        #   continue

                    else:
                        temp = data[attribute].values
                    treatment[index] = 'int'

                else:
                    treatment[index] = 'bin'

        except TypeError:
            # the sampled value is not a string (e.g. already numeric)

            if (utils.isfloat(data[attribute].values[i])):
                # index+=1
                # continue
                temp = np.array(data[attribute].values).reshape(-1, 1)
                treatment[index] = 'float'
            elif (utils.isint(data[attribute][i])):
                # index+=1
                # continue
                temp = (np.array(data[attribute].values) * 1).reshape(-1, 1)
                treatment[index] = 'int'
            else:
                print("could not identify type of attribute %s" % attribute)
                # NOTE(review): terminates the whole interpreter from library
                # code -- raising an exception may be preferable
                exit(-1)

        # treatment of class attribute: always use the factorized codes
        if (class_name in attribute):
            temp = t[0]
            treatment[index] = 'int'

        if (treatment[index] == 'float'):
            if (missing_input != 'none'):
                imp = preprocessing.Imputer(strategy=missing_input, axis=0)
                temp = imp.fit_transform(
                    X=np.array(data[attribute].values).reshape(-1, 1))
            else:
                temp = data[attribute].values
            #print(np.array(list((float(x) for x in temp))).reshape(-1,1).shape)
            transformedData.append(
                np.array(list((float(x) for x in temp))).reshape(-1, 1))

        else:
            # t[0] corresponds to the translated numeric data
            # t[1] corresponds to a list with the possible values for each feature'
            # (different values in a column, e.g. [sim, não]).
            # the index of that value in the list corresponds to its numeric representation
            # (e.g. [sim, não] -> sim is represented by 0 and não by 1).
            # if(missing_input != 'none' and treatment[index] != 'bin'):
            #   imp = preprocessing.Imputer(missing_values=-1,strategy=missing_input,axis=0)
            #   temp = imp.fit_transform(X=temp.reshape(-1,1))

            if (treatment[index] == 'bin'):
                #imp = preprocessing.Imputer(missing_values=-1,strategy='mean',axis=0)
                #temp = imp.fit_transform(X=np.array(temp).reshape(-1,1))

                # one output column per distinct value, named "<attribute>=<value>"
                temp = pd.get_dummies(np.ravel(data[attribute].values))
                for x in temp.columns:
                    attributes.append(attribute + '=' + x)
                    #print(temp[x].reshape(-1,1).shape)
                    transformedData.append(temp[x].reshape(-1, 1))

            elif (treatment[index] == 'int'):
                if (not transform_numeric):
                    # keep the raw values; strings are upper-cased in place
                    temp = data[attribute].values
                    for temp_index in range(len(temp)):
                        if (isinstance(temp[temp_index], str)):
                            temp[temp_index] = temp[temp_index].upper()

                    i = utils.firstNotNan(data[attribute].values)
                    if (utils.isint(data[attribute].values[i])
                            and missing_input != 'none'):
                        # mark missing entries with -1 so the imputer can
                        # replace them
                        temp[data[attribute].values == 'NAAI'] = -1
                        temp[np.isnan(
                            np.array(data[attribute].values,
                                     dtype=float))] = -1
                        imp = preprocessing.Imputer(missing_values=-1,
                                                    strategy=missing_input,
                                                    axis=0)
                        temp = imp.fit_transform(
                            X=np.array(list(int(x)
                                            for x in temp)).reshape(-1, 1))

                elif (missing_input != 'none'):
                    imp = preprocessing.Imputer(missing_values=np.nan,
                                                strategy=missing_input,
                                                axis=0)
                    temp = imp.fit_transform(X=np.array(temp).reshape(-1, 1))

                #print(np.array(temp).reshape(-1,1).shape)
                transformedData.append(np.array(temp).reshape(-1, 1))

            elif (treatment[index] == 'date'):
                # keep only the year (first four characters); float entries
                # (i.e. NaN) become -1
                temp = []
                for date in data[attribute].values:
                    if (not isinstance(date, float)):
                        temp.append(int(date[:4]))
                    else:
                        temp.append(-1)
                if (missing_input != 'none'):
                    # NOTE(review): this imputer is not given
                    # missing_values=-1, so the -1 placeholders above may not
                    # be imputed -- confirm
                    imp = preprocessing.Imputer(strategy='most_frequent',
                                                axis=0)

                    temp = imp.fit_transform(X=np.array(temp).reshape(-1, 1))

                #print(np.array(temp).reshape(-1,1).shape)
                transformedData.append(np.array(temp).reshape(-1, 1))

            elif (use_text and treatment[index] == 'text'):
                #try:
                bigword = ''
                #print(attribute)
                try:
                    # one count column per n-gram (1 to 4 words) that appears
                    # in at least 25% of the documents
                    bag_of_words = CountVectorizer(min_df=0.25,
                                                   stop_words=sw,
                                                   ngram_range=(1, 4))
                    #print(data[attribute])
                    words = np.array(
                        bag_of_words.fit_transform(
                            ((data[attribute].values))).todense())
                    c = 0
                    for word in bag_of_words.get_feature_names():
                        bigword = bigword + word + ' '
                        attributes.append(attribute + ' termo: ' + word)
                        transformedData.append(words[:, c].reshape(-1, 1))
                        c += 1
                        # wordcloud = WordCloud(stopwords=sw,background_color='white').generate(bigword,)
                        # plt.imshow(wordcloud)
                        # plt.axis('off')
                        # plt.show()
                except (ValueError, AttributeError):
                    # vectorization failed for this column: drop it

                    index += 1
                    continue
            else:
                # text column with use_text disabled (or unknown treatment):
                # drop it
                index += 1
                continue
            # else:
            #   print('undefined option for pre processing: (%s, %s) ' % (categ) )
            #   exit(-1)

        categories.append(t[1])
        # 'text' and 'bin' columns already registered their own output names
        if (treatment[index] != 'text' and treatment[index] != 'bin'):
            attributes.append(attribute)

        index += 1

    # stack the collected columns into an (n_samples, n_attributes) table
    data = np.array(transformedData).reshape(-1, n_samples).T
    data = pd.DataFrame(data, columns=attributes)

    # pd.DataFrame(data,columns=attributes).to_csv('out.csv', index=False)
    # f = open('DorR.csv', 'w')
    # f.write(','.join(np.array(attributes, dtype=object)))

    # for d in data:
    #   f.write('\n')
    #   f.write(','.join(str(dd) for dd in d))
    # exit()

    return data
Example #20
0
def parseUCSCHeader(header, header_prefix='>', retainKeys=True, toInt=True):
    '''
    Parse UCSC Table Browser FASTA header.

    Example: hg38_knownGene_ENST00000376838.5_0 range=chr1:11130526-11131568 5'pad=10 3'pad=3 strand=- repeatMasking=none

    Args
    - header: str
        FASTA header line
    - header_prefix: str. default='>'
        FASTA header line prefix
    - retainKeys: bool. default=True
        Retain original FASTA header keys, e.g, 5'pad and 3'pad, instead of the converted
        valid Python identifiers, e.g., pad5 and pad3.
    - toInt: bool. default=True
        Where possible, convert string values to int. May impact performance.

    Returns: dict
    - Map of metadata of protein sequence.
    - Keys
      - name: sequence name
      - chrom: chromosome (chr#)
      - chromStart: start coordinate (browser format: 1-based start and end)
      - chromEnd: end coordinate (browser format: 1-based start and end)
      - 5'pad: extra bases at the 5' end of the feature
      - 3'pad: extra bases at the 3' end of the feature
      - strand: +/-
      - repeatMasking: mask repeats
        - none: no repeat masking
        - N: repeats are masked to N's
        - lower: repeats are masked to lower case
      - See https://genomebrowser.wustl.edu/goldenPath/help/hgTextHelp.html#FASTA for an older description
        of the FASTA header.
      - In the Table Browser, these options are specified after clicking 'get output'

    Raises
    - ValueError: if the header does not match FASTA_HEADER_REGEX_UCSC.
    '''

    # strip whitespace and prefix
    header = header.strip()
    if header.startswith(header_prefix):
        header = header[len(header_prefix):]

    # extract key, value pairs from regex match to dict
    m = FASTA_HEADER_REGEX_UCSC.match(header)
    if m is None:
        # match() returns None on failure; fail with a clear message instead
        # of an AttributeError on None.groupdict()
        raise ValueError(
            'Header does not match the UCSC FASTA header format: %r' % header)
    data = m.groupdict()

    if retainKeys:
        data['5\'pad'] = data.pop('pad5')
        data['3\'pad'] = data.pop('pad3')

    # remove leading/trailing whitespace from each value and, if requested,
    # convert str to int where applicable
    parsed = {}
    for key, value in data.items():
        # groups that did not take part in the match (if the pattern has
        # optional groups) are None and are passed through unchanged
        if value is not None:
            value = value.strip()
            if toInt and utils.isint(value):
                value = int(value)
        parsed[key] = value
    return parsed
Example #21
0
def plot_feature_contributions(X,
                               feature_index,
                               fcs,
                               attributes,
                               class_of_interest,
                               title=None):
    """Plot the contributions of one feature against the feature's values.

    For every row ``i`` of ``X`` whose contribution mapping ``fcs[i]``
    contains ``feature_index``, the contribution
    ``fcs[i][feature_index][class_of_interest]`` is plotted on the x axis
    against the value of the feature on the y axis. Positive contributions
    are drawn in blue, negative ones in red and zero ones in black. Summary
    statistics per feature value are printed and handed to ``boxplot``.

    Args:
        X: data matrix, indexed as ``X[:, feature_index]`` and ``X[i][j]``
            (presumably a 2-D NumPy array -- confirm against callers).
        feature_index: column of ``X`` to analyse; also the key looked up in
            each ``fcs[i]``.
        fcs: per-row mappings from feature index to a per-class sequence of
            contributions.
        attributes: feature names; ``attributes[feature_index]`` labels the
            y axis.
        class_of_interest: index of the class whose contribution is plotted.
        title: if not ``None``, the figure is saved to this path after being
            shown.
    """

    # the first non-NaN value decides whether the feature is treated as
    # categorical (this branch) or numeric (else branch)
    if (not utils.isint(X[utils.firstNotNan(
            X[:, feature_index])][feature_index]) and not utils.isfloat(
                X[utils.firstNotNan(X[:, feature_index])][feature_index])):
        # distinct non-NaN values, with a single NaN slot appended at the end
        values = [i for i in set(X[:, feature_index]) if not utils.isnan(i)
                  ] + [np.nan]

        pos_fcs = []
        neg_fcs = []
        pos_values = []
        neg_values = []
        zero_fcs = []
        zero_values = []
        contributions = {}  # feature value -> list of contributions

        for i in range(X.shape[0]):

            if (feature_index in fcs[i].keys()):
                if (fcs[i][feature_index][class_of_interest] > 0):
                    pos_fcs.append(fcs[i][feature_index][class_of_interest])
                    #this is necessary because of weird behavior when X[i][feature_index] is nan
                    #and for some reason it says that nan is not values
                    if (utils.isnan(X[i][feature_index])):
                        # NaN is plotted at the last position of `values`
                        pos_values.append(len(values) - 1)
                    else:
                        pos_values.append(values.index(X[i][feature_index]))
                elif (fcs[i][feature_index][class_of_interest] == 0):
                    zero_fcs.append(0)
                    if (utils.isnan(X[i][feature_index])):
                        zero_values.append(len(values) - 1)
                    else:
                        zero_values.append(values.index(X[i][feature_index]))
                else:
                    neg_fcs.append(fcs[i][feature_index][class_of_interest])
                    if (utils.isnan(X[i][feature_index])):
                        neg_values.append(len(values) - 1)
                    else:
                        neg_values.append(values.index(X[i][feature_index]))
                # group the contribution under the raw feature value
                if (X[i][feature_index] not in contributions.keys()):
                    contributions[X[i][feature_index]] = [
                        fcs[i][feature_index][class_of_interest]
                    ]
                else:
                    contributions[X[i][feature_index]].append(
                        fcs[i][feature_index][class_of_interest])

        print('Contributions:')
        for value in contributions.keys():
            print('Value %r' % value)
            print(
                '\nMean: %r Variance: %r' %
                (np.mean(contributions[value]), np.var(contributions[value])))

        c = (contributions.items())
        # the caller's `title` is not forwarded to boxplot
        boxplot([a[1] for a in c], [a[0] for a in c], title=None)

        ax = plt.subplot(111)
        plt.plot(pos_fcs, pos_values, 'x', color='blue')
        plt.plot(neg_fcs, neg_values, 'x', color='red')
        plt.plot(zero_fcs, zero_values, 'x', color='black')
        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        # one tick per value plus an empty padding tick at each end
        ax.set_yticks(np.array(range(len(values) + 2)) - 1)
        ax.set_yticklabels([str('')] + values + [str('')])
        plt.show()

    else:

        # sorted distinct non-NaN values, rounded to 4 decimals
        values = sorted([
            round(i, 4) for i in (set(X[:, feature_index]))
            if not utils.isnan(i)
        ])  # + [np.nan]

        # gap between the two largest values; used as the offset at which NaN
        # rows are plotted above the maximum.
        # NOTE(review): requires at least two distinct non-NaN values,
        # otherwise values[-2] raises IndexError
        nan_index = values[-1] - values[-2]
        pos_fcs = []
        neg_fcs = []
        pos_values = []
        neg_values = []
        zero_fcs = []
        zero_values = []
        contributions = {}  # feature value (or 'nan') -> list of contributions

        for i in range(X.shape[0]):
            if (feature_index in fcs[i].keys()):

                if (fcs[i][feature_index][class_of_interest] > 0):
                    pos_fcs.append(fcs[i][feature_index][class_of_interest])
                    #this is necessary because of weird behavior when X[i][feature_index] is nan
                    #and for some reason it says that nan is not values
                    if (utils.isnan(X[i][feature_index])):
                        pos_values.append(values[-1] + nan_index)
                    else:
                        pos_values.append(X[i][feature_index])
                elif (fcs[i][feature_index][class_of_interest] == 0):
                    zero_fcs.append(0)
                    if (utils.isnan(X[i][feature_index])):
                        zero_values.append(values[-1] + nan_index)
                    else:
                        zero_values.append(X[i][feature_index])
                else:
                    neg_fcs.append(fcs[i][feature_index][class_of_interest])
                    if (utils.isnan(X[i][feature_index])):
                        neg_values.append(values[-1] + nan_index)
                    else:
                        neg_values.append((X[i][feature_index]))
                # NaN rows are grouped under the string key 'nan'
                if (utils.isnan(X[i][feature_index])):
                    if ('nan' in contributions.keys()):
                        contributions['nan'].append(
                            fcs[i][feature_index][class_of_interest])
                    else:
                        contributions['nan'] = [
                            fcs[i][feature_index][class_of_interest]
                        ]
                elif (X[i][feature_index] in contributions.keys()):
                    contributions[(X[i][feature_index])].append(
                        fcs[i][feature_index][class_of_interest])
                else:
                    contributions[(X[i][feature_index])] = [
                        fcs[i][feature_index][class_of_interest]
                    ]

        print('Contributions:')
        for value in contributions.keys():
            print('Value %r' % value)
            # NOTE(review): the label says "Variance" but np.std (standard
            # deviation) is printed; the categorical branch uses np.var
            print(
                'Mean: %r Variance: %r' %
                (np.mean(contributions[value]), np.std(contributions[value])))
        c = (contributions.items())
        boxplot([a[1] for a in c], [a[0] for a in c], title=None)
        fig, ax = plt.subplots()
        plt.plot(pos_fcs, pos_values, 'x', color='blue')
        plt.plot(neg_fcs, neg_values, 'x', color='red')
        plt.plot(zero_fcs, zero_values, 'x', color='black')
        # draw once so that the tick labels are populated before reading them
        fig.canvas.draw()
        labels = [''] + [item.get_text()
                         for item in ax.get_yticklabels()] + ['']
        # add ticks below the minimum and at the NaN position above the
        # maximum (plus one more above it when it exceeds the current ticks)
        if (values[-1] + nan_index < ax.get_yticks()[-1]):
            plt.yticks(
                [values[0] - nan_index] +
                sorted(list(ax.get_yticks()) + [values[-1] + nan_index]))
        else:
            plt.yticks([values[0] - nan_index] + sorted(
                list(ax.get_yticks()) +
                [values[-1] + nan_index, values[-1] + 2 * nan_index]))
        labels[-2] = 'nan'

        plt.xlabel('feature contribution')
        plt.ylabel('values of feature %r' % attributes[feature_index])
        ax.set_yticklabels(labels)

        plt.show()

    # NOTE(review): savefig runs after plt.show(); depending on the backend
    # the figure may already be closed and the saved file empty -- confirm
    if (title is not None):
        plt.savefig(title)
        plt.close()
Example #22
0
    def _parselinewithoutindent(self, line, tocmatcher):
        """Classify ``line`` and append or extend an element in ``self.elements``.

        The checks run in a fixed order and the first match wins:
        "Forward references: " paragraph, title, heading, unordered list
        item, ordered list item, value definition, indented continuation /
        paragraph / code block, continuation of the previous text element,
        and finally a new paragraph.

        ``self._inelement`` is set to ``False`` after titles and headings and
        to ``True`` after every other newly appended element, so that
        following lines may be attached to it.

        Args:
            line: the line of text to classify. It is expected to contain at
                least one non-whitespace token unless it is consumed by the
                forward-references, title or heading checks.
            tocmatcher: object providing ``matchtitle(line)`` and
                ``matchheading(line)``.
        """
        splits = line.split(maxsplit=1)
        groups = utils.groupwords(line)
        previous = self.elements[-1] if self.elements else None
        if line.lstrip()[:20] == "Forward references: ":
            # make it its own paragraph
            self.elements.append(elements.Paragraph(line))
            self._inelement = True
            return
        #print(tocmatcher._titlestack[-1], tocmatcher._headingstack[-1], line, sep='\t')
        if tocmatcher.matchtitle(line):
            if (re.match(toc.KEYREGEX, splits[0])
                    or re.match(fr"{toc.CHAPTERREGEX}\.", splits[0])):
                # numbered title
                self.elements.append(elements.NumberedTitleHeading(line))
            else:
                # title
                self.elements.append(elements.TitleHeading(line))
            self._inelement = False
            return
        if tocmatcher.matchheading(line):
            if len(splits) == 2:
                # numbered title
                self.elements.append(elements.NumberedTitleHeading(line))
            else:
                # heading
                self.elements.append(elements.NumberedHeading(line))
            self._inelement = False
            return
        if splits[0][0] in ("—", '•'):
            # list element
            self.elements.append(elements.UnorderedListItem(line))
            self._inelement = True
            return
        #print(splits)
        if splits[0][:-1].isdigit() and splits[0][-1] == '.':
            # ordered list
            self.elements.append(elements.OrderedListItem(line))
            self._inelement = True
            return
        if len(splits) == 2:
            # slice instead of index: a three-character line such as "a b"
            # has two splits but no fourth character, and line[3] would raise
            # IndexError (an empty slice is simply not whitespace)
            if line[3:4].isspace():
                # maybe value definition with number as value
                if utils.isint(line[:3]):
                    # value definition of number
                    self.elements.append(elements.ValueDefinition(*splits))
                    self._inelement = True
                    return
                if line[0] == '−' and utils.isint(line[1:3]):
                    # value definition of number, but with a misunderstood −
                    # U+2212 MINUS SIGN instead of a - U+002D HYPHEN-MINUS
                    self.elements.append(
                        elements.ValueDefinition(splits[0].replace('−', '-'),
                                                 splits[1]))
                    self._inelement = True
                    return
            if splits[0][:2] == "__" and splits[0][-2:] == "__":
                # value definition of preprocessing macro
                self.elements.append(elements.ValueDefinition(*splits))
                self._inelement = True
                return

            if (not line[:2].isspace()  # no or little indent
                    and len(groups) == 2):  # only 2 groups
                stripped = line.lstrip()  # remove any indentation
                l = len(splits[0])
                # if the second split starts between the 12th or 15th column,
                # and there are at least 4 spaces between splits
                if (12 <= stripped.find(splits[1], l) <= 15
                        and stripped[l:l + 4].isspace()):
                    # value definition of anything
                    self.elements.append(elements.ValueDefinition(*splits))
                    self._inelement = True
                    return

        if line[:4].isspace():
            # indented text?
            if self._inelement:
                # maybe it's part of the previous element ?
                def maybepreviouselement(previouselement, line):
                    if isinstance(previouselement, elements.Code):
                        return True
                    if isinstance(previouselement, elements.ValueDefinition):
                        return True
                    if isinstance(previouselement, elements.OrderedListItem):
                        return True
                    if isinstance(previouselement, elements.UnorderedListItem):
                        # continuation text must be indented exactly two
                        # columns deeper than the list item itself
                        indent = previouselement.indent + 2
                        if (line[:indent].isspace()
                                and not line[indent].isspace()):
                            return True
                    return False

                if maybepreviouselement(previous, line):
                    previous.addcontent(line)
                    return
            if (self.elements
                    and isinstance(previous, elements.UnorderedListItem)
                    and previous.level > 1):
                # indented paragraph, inside a list
                self.elements.append(elements.Paragraph(line))
                self._inelement = True
                return
            if line[:7].isspace():
                # code block
                # we already checked for _inelement
                self.elements.append(elements.Code(line))
                self._inelement = True
                return
            # idk, make it a paragraph

        # regular text
        if (self._inelement and isinstance(previous, elements.Text)
                and not isinstance(previous, elements.ValueDefinition)):
            # continuation of previous text element
            previous.addcontent(line)
            return
        # new paragraph
        self.elements.append(elements.Paragraph(line))
        self._inelement = True
        return
Example #23
0
def encode(s, **kwargs):
    """Return the binary digits of ``s`` as a string without the '0b' prefix.

    A string starting with '0x' is first parsed as a hexadecimal integer.
    Anything that ``isint`` does not accept is encoded to bytes and turned
    into an integer with ``bytes_to_long`` before being rendered with ``bin``.
    Extra keyword arguments are accepted and ignored.
    """
    value = int(s[2:], 16) if s.startswith('0x') else s
    if isint(value):
        return bin(value)[2:]
    return bin(bytes_to_long(value.encode()))[2:]
Example #24
0
import decisionTree as dt
import randomForest as rf
import numpy as np
import pandas as pd
import read
import utils
import math
from sklearn.metrics import accuracy_score

# Preprocessing switches -- presumably forwarded to the data reader further
# down in this script (not visible here).
dummy = False
transform = False
use_text = False

# Sanity checks for the utils.isint / utils.isfloat helpers.
print('Testing utils...')
# isint accepts ints and strings holding a (possibly negative) integer
assert (utils.isint(10))
assert (utils.isint('50'))
assert (utils.isint('-999'))
# floats and float-looking strings are not ints
assert (not utils.isint(1.0))
assert (not utils.isint('50.0'))
# booleans are expected to count as ints
assert (utils.isint(True))
assert (not utils.isint('aba'))
assert (not utils.isint('a?a'))
assert (not utils.isint('49.x'))
# isfloat accepts only strings with a decimal part; plain integers are rejected
assert (not utils.isfloat('0.x'))
assert (utils.isfloat('0.0'))
assert (utils.isfloat('12.984'))
assert (utils.isfloat('-0.4'))
assert (not utils.isfloat('9'))

# Column names of the test data set; the last entry is named 'Class'.
original_attributes = np.array(
    ['Outlook', 'Temp', 'Humidity', 'Windy?', 'Class'])