def clean(train):
    """Strip apostrophes from the first column of ``train`` and tokenize it.

    The first column is read positionally via ``train.iloc[:, 0]`` and every
    entry must support ``str.translate``.

    Returns:
        A pair ``(sentences, tokens)``: ``sentences`` holds the entries with
        every ``'`` character removed, and ``tokens`` holds the result of
        ``word_tokenize`` applied to the lower-cased form of each sentence.
    """
    # Translation table that deletes the apostrophe character.
    drop_apostrophes = str.maketrans("", "", "'")
    sentences = [
        text.translate(drop_apostrophes) for text in train.iloc[:, 0].tolist()
    ]
    tokens = [word_tokenize(sentence.lower()) for sentence in sentences]
    return sentences, tokens
Exemple #2
0
def calc_format(val, linesep='\n', **opts):
    """Render ``val`` as a display string.

    Strings are returned unchanged.  When LaTeX output is requested
    (``config.latex`` or the ``tex`` option) the value is passed through
    ``latex`` and every non-ASCII character of the result is rewritten with
    ``gr_to_tex``.  Otherwise tuples are rendered as bracketed lists
    (matrices as a block framed with ╭ ╮ ╰ ╯) and any other value is
    rendered by ``format_atom``.

    The function keeps its state in module globals: ``options`` (option dict
    in effect), ``depth`` (current recursion depth), ``indent_level`` and
    ``line_sep_space`` (the last non-default line separator).  ``depth`` and
    ``indent_level`` are restored even when formatting raises.

    Args:
        val: The value to format.
        linesep: Separator between the lines of multi-line output.  A value
            other than a newline is remembered in ``line_sep_space``; the
            default reuses whatever is currently remembered there.
        **opts: Formatting options ``tex``, ``sci``, ``bin`` and ``hex``.
            When given they replace the global ``options``; when omitted the
            global ``options`` are used (and reset to defaults at depth 0).
            A missing key is treated as "off".

    Returns:
        The formatted string.
    """
    global options, depth, indent_level, line_sep_space

    if isinstance(val, str):
        return val

    if opts:
        options = opts
    else:
        if depth == 0:
            options = {'tex': config.latex, 'sci': 0, 'bin': 0, 'hex': 0}
        opts = options

    # .get(): callers may pass only a subset of the options.
    if config.latex or opts.get('tex'):
        s = latex(Matrix(val) if is_matrix(val) else val)
        # substitute the Greek letters to tex representations
        return translate(r'[^\x00-\x7F]', lambda m: gr_to_tex(m[0]), s)

    if linesep != '\n':
        line_sep_space = linesep
    else:
        linesep = line_sep_space

    def format_float(x):
        # Round to config.precision significant digits.
        prec = config.precision
        return float(f'%.{prec}g' % x)

    def format_scinum(x):
        def positive_case(x):
            supscripts = '⁰¹²³⁴⁵⁶⁷⁸⁹'
            e = floor(log(x) / log(10))
            b = format_float(x / 10**e)
            supscript_pos = lambda n: ''.join(
                [supscripts[int(i)] for i in str(n)])
            # The sign test reads ``e`` from the enclosing scope; the lambda
            # is only ever called with ``e`` itself.
            supscript = lambda n: '⁻' + supscript_pos(
                -n) if e < 0 else supscript_pos(n)
            return f"{b}×10{supscript(e)}"

        if x == 0: return '0'
        return positive_case(x) if x > 0 else '-' + positive_case(-x)

    def format_matrix(mat):
        def row_str(row, start, end, sep='  '):
            return f"{start}{sep.join([s.ljust(space) for s in row])}{end}"

        mat = [[format(x) for x in row] for row in mat]
        # Width of the widest cell; every cell is left-justified to it.
        space = max([max([len(s) for s in row]) for row in mat])
        col_num = len(mat[0])
        return f'{linesep}{indent}'.join(
            [row_str([''] * col_num, '╭', '╮')] +
            [row_str(row, ' ', ' ', ', ')
             for row in mat] + [row_str([''] * col_num, '╰', '╯')])

    def format_atom(val):
        if is_number(val):
            mag = abs(val)
            if type(val) is complex:
                re, im = format_float(val.real), format_float(val.imag)
                return f"{re} {'-' if im<0 else '+'} {abs(im)}ⅈ"
            elif mag == inf:
                return '∞'
            elif isinstance(val, Rational) and not opts.get('sci'):
                if type(val) is Fraction:
                    # limit_denominator returns a new Fraction; the result
                    # must be kept or the limit has no effect.
                    val = val.limit_denominator(10**config.precision)
                if opts.get('bin'): return bin(val)
                elif opts.get('hex'): return hex(val)
                else: return str(val)
            elif mag <= 0.001 or mag >= 10000:
                return format_scinum(val)
            else:
                return str(format_float(val))
        elif is_function(val):
            return str(val) if depth == 1 else repr(val)
        elif is_env(val):
            if hasattr(val, 'val'):
                return calc_format(val.val)
            else:
                return str(val) if depth == 1 else repr(val)
        elif isinstance(val, Range):
            return str(val)
        else:
            return pretty(val, use_unicode=True)

    depth += 1
    try:
        indent = ' ' * indent_width * indent_level
        s = indent
        if type(val) is tuple:
            if any(map(is_matrix, val)):
                indent_level += 1
                try:
                    items = f',{linesep}'.join(map(calc_format, val))
                finally:
                    # Undo the extra indentation even if an item fails.
                    indent_level -= 1
                s += '[{0}{1}{0}{2}]'.format(linesep, items, indent)
            elif is_matrix(val):
                s += format_matrix(val)
            else:
                s += '[%s]' % ', '.join(map(calc_format, val))
        else:
            s += format_atom(val)
    finally:
        # Keep the global recursion depth balanced on every exit path.
        depth -= 1
    return s
Exemple #3
0
def get_rc(re):
    """
    Return the reverse complement of a DNA/RNA RE.

    Each upper-case base or ambiguity code is swapped for its complement
    (``U`` maps to ``A``); characters outside the table are left unchanged.
    The complemented string is then reversed.
    """
    complement_table = str.maketrans('ACGTURYKMBVDHSWN', 'TGCAAYRMKVBHDSWN')
    complemented = re.translate(complement_table)
    return ''.join(reversed(complemented))
Exemple #4
0
def format(val, indent=0, sci=False, tex=False):
    """Render ``val`` as a display string.

    When LaTeX output is requested (``config.latex`` or ``tex``) the value is
    passed through ``latex`` and every non-ASCII character of the result is
    rewritten with ``gr_to_tex``.  Otherwise values for which ``is_list`` is
    true are rendered as bracketed lists (matrices as a block framed with
    ╭ ╮ ╰ ╯, lists containing matrices one item per line) and any other
    value is rendered by ``format_atom``.

    Args:
        val: The value to format.
        indent: Number of spaces placed in front of the output.
        sci: When true, rational numbers are not printed exactly but go
            through the float / scientific-notation path.  The flag is
            carried into nested list and matrix elements.
        tex: Request LaTeX output for this call.

    Returns:
        The formatted string.
    """
    if config.latex or tex:
        s = latex(Matrix(val) if is_matrix(val) else val)
        # substitute the Greek letters to tex representations
        return translate(r'[^\x00-\x7F]', lambda m: gr_to_tex(m[0]), s)

    def format_float(x):
        # Round to config.precision significant digits.
        prec = config.precision
        return float(f'%.{prec}g' % x)

    def format_scinum(x):
        def positive_case(x):
            supscripts = '⁰¹²³⁴⁵⁶⁷⁸⁹'
            e = floor(log(x) / log(10))
            b = format_float(x / 10**e)
            supscript_pos = lambda n: ''.join(
                [supscripts[int(i)] for i in str(n)])
            # The sign test reads ``e`` from the enclosing scope; the lambda
            # is only ever called with ``e`` itself.
            supscript = lambda n: '⁻' + supscript_pos(
                -n) if e < 0 else supscript_pos(n)
            return f"{b}×10{supscript(e)}"

        if x == 0: return '0'
        return positive_case(x) if x > 0 else '-' + positive_case(-x)

    def format_matrix(mat, indent):
        # Propagate ``sci`` so cells follow the caller's number style.
        mat = [[format(x, sci=sci) for x in row] for row in mat]
        # Width of the widest cell; every cell is left-justified to it.
        space = max(max(len(s) for s in row) for row in mat)

        def row_str(row, start, end, sep='  '):
            cells = sep.join(s.ljust(space) for s in row)
            return ' ' * indent + f"{start} {cells}{end}"

        col_num = len(mat[0])
        return '\n'.join([row_str([''] * col_num, '╭', '╮')] +
                         [row_str(row, ' ', ' ', ', ') for row in mat] +
                         [row_str([''] * col_num, '╰', '╯')])

    def format_atom(val):
        if is_number(val):
            mag = abs(val)
            if type(val) is complex:
                re, im = format_float(val.real), format_float(val.imag)
                return f"{re} {'-' if im<0 else '+'} {abs(im)}ⅈ"
            elif mag == inf:
                return '∞'
            elif isinstance(val, Rational) and not sci:
                if type(val) is Fraction:
                    # limit_denominator returns a new Fraction; the result
                    # must be kept or the limit has no effect.
                    val = val.limit_denominator(10**config.precision)
                return str(val)
            elif mag <= 0.001 or mag >= 10000:
                return format_scinum(val)
            else:
                return str(format_float(val))
        elif type(val) is FunctionType:  # builtin
            return val.str
        elif isinstance(val, Range):
            return str(val)
        else:
            return pretty(val, use_unicode=True)

    pad = ' ' * indent
    if not is_list(val):
        return pad + format_atom(val)

    if any(is_matrix(a) for a in val):
        # A list holding matrices: one item per line, indented two further.
        items = ',\n'.join(format(v, indent + 2, sci) for v in val)
        return pad + '[\n' + items + '\n' + pad + ']'
    if is_matrix(val):
        return format_matrix(val, indent)
    return pad + '[' + ', '.join(format(v, sci=sci) for v in val) + ']'