def clean(train):
    """Strip apostrophes from the first column of ``train`` and tokenize it.

    Args:
        train: Object exposing ``.iloc[:, 0].tolist()`` (e.g. a pandas
            DataFrame) whose first column holds strings.

    Returns:
        A ``(sentences, tokenized)`` tuple: ``sentences`` is the list of
        first-column strings with every ``'`` removed (original case kept);
        ``tokenized`` holds ``word_tokenize`` applied to the lower-cased form
        of each of those strings, in the same order.
    """
    raw_sentences = train.iloc[:, 0].tolist()
    # Translation table whose only effect is deleting the apostrophe.
    drop_apostrophes = str.maketrans("", "", "'")
    word = [sentence.translate(drop_apostrophes) for sentence in raw_sentences]
    # NOTE(review): word_tokenize is presumably nltk.tokenize.word_tokenize,
    # imported elsewhere in the file -- confirm.
    tokenized_sent = [word_tokenize(sentence.lower()) for sentence in word]
    return word, tokenized_sent
def calc_format(val, linesep='\n', **opts):
    """Render ``val`` as a display string.

    Strings are returned unchanged.  Tuples are rendered as bracketed lists
    (one item per line when any item is a matrix), matrices as a boxed grid,
    and everything else through ``format_atom``.

    The function keeps its state in module-level globals: ``options`` (the
    active flags), ``depth`` (recursion depth), ``indent_level`` and
    ``line_sep_space`` (the last non-default ``linesep``).

    Args:
        val: Value to render.
        linesep: Separator placed between rows/items.  A value other than
            ``'\\n'`` is stored in ``line_sep_space``; the default ``'\\n'``
            is replaced by whatever ``line_sep_space`` currently holds.
        **opts: Formatting flags ``tex``, ``sci``, ``bin``, ``hex``.  Flags
            that are not supplied fall back to their defaults.  When no flags
            are given, a top-level call (``depth == 0``) resets ``options`` to
            the defaults and a nested call reuses the current ``options``.

    Returns:
        The formatted string.  When ``config.latex`` or the ``tex`` flag is
        truthy, the result is the ``latex(...)`` output with every non-ASCII
        character passed through ``gr_to_tex``.
    """
    global options, depth, indent_level, line_sep_space

    if isinstance(val, str):
        return val

    if opts:
        # Merge with the defaults so a partial flag set such as
        # ``calc_format(x, sci=1)`` cannot raise KeyError further down.
        options = {'tex': config.latex, 'sci': 0, 'bin': 0, 'hex': 0, **opts}
    elif depth == 0:
        options = {'tex': config.latex, 'sci': 0, 'bin': 0, 'hex': 0}
    opts = options

    if config.latex or opts['tex']:
        s = latex(Matrix(val) if is_matrix(val) else val)
        # substitute the Greek letters to tex representations
        return translate(r'[^\x00-\x7F]', lambda m: gr_to_tex(m[0]), s)

    if linesep != '\n':
        line_sep_space = linesep
    else:
        linesep = line_sep_space

    def format_float(x):
        """Round ``x`` to ``config.precision`` significant digits."""
        prec = config.precision
        return float(f'%.{prec}g' % x)

    def format_scinum(x):
        """Render ``x`` as ``mantissa×10ⁿ`` with a superscript exponent."""
        supscripts = '⁰¹²³⁴⁵⁶⁷⁸⁹'

        def superscript(n):
            digits = ''.join(supscripts[int(d)] for d in str(abs(n)))
            return '⁻' + digits if n < 0 else digits

        def positive_case(x):
            exponent = floor(log(x) / log(10))
            mantissa = format_float(x / 10**exponent)
            # log(x)/log(10) can land just below an integer for exact powers
            # of ten, and rounding can push the mantissa up to 10; normalise
            # so the mantissa stays below 10.
            if mantissa >= 10:
                exponent += 1
                mantissa = format_float(x / 10**exponent)
            return f"{mantissa}×10{superscript(exponent)}"

        if x == 0:
            return '0'
        return positive_case(x) if x > 0 else '-' + positive_case(-x)

    def format_matrix(mat):
        """Render ``mat`` as rows framed by rounded box corners."""
        def row_str(row, start, end, sep=' '):
            return f"{start}{sep.join([s.ljust(space) for s in row])}{end}"

        # NOTE(review): cells go through ``format`` rather than
        # ``calc_format`` -- confirm this is intentional.
        mat = [[format(x) for x in row] for row in mat]
        space = max([max([len(s) for s in row]) for row in mat])
        col_num = len(mat[0])
        return f'{linesep}{indent}'.join(
            [row_str([''] * col_num, '╭', '╮')] +
            [row_str(row, ' ', ' ', ', ') for row in mat] +
            [row_str([''] * col_num, '╰', '╯')])

    def format_atom(atom):
        """Render a single non-tuple value."""
        if is_number(atom):
            mag = abs(atom)
            if type(atom) is complex:
                real_part = format_float(atom.real)
                im = format_float(atom.imag)
                return f"{real_part} {'-' if im<0 else '+'} {abs(im)}ⅈ"
            elif mag == inf:
                # Keep the sign: negative infinity must not print as '∞'.
                return '-∞' if atom < 0 else '∞'
            elif isinstance(atom, Rational) and not opts['sci']:
                if type(atom) is Fraction:
                    # limit_denominator returns a new Fraction; the result
                    # has to be kept for the call to have any effect.
                    atom = atom.limit_denominator(10**config.precision)
                if opts['bin']:
                    return bin(atom)
                elif opts['hex']:
                    return hex(atom)
                else:
                    return str(atom)
            elif mag <= 0.001 or mag >= 10000:
                return format_scinum(atom)
            else:
                return str(format_float(atom))
        elif is_function(atom):
            return str(atom) if depth == 1 else repr(atom)
        elif is_env(atom):
            if hasattr(atom, 'val'):
                return calc_format(atom.val)
            else:
                return str(atom) if depth == 1 else repr(atom)
        elif isinstance(atom, Range):
            return str(atom)
        else:
            return pretty(atom, use_unicode=True)

    depth += 1
    try:
        indent = ' ' * indent_width * indent_level
        s = indent
        if type(val) is tuple:
            if any(map(is_matrix, val)):
                indent_level += 1
                try:
                    items = f',{linesep}'.join(map(calc_format, val))
                finally:
                    # Restore even if an item fails to format.
                    indent_level -= 1
                s += '[{0}{1}{0}{2}]'.format(linesep, items, indent)
            elif is_matrix(val):
                s += format_matrix(val)
            else:
                s += '[%s]' % ', '.join(map(calc_format, val))
        else:
            s += format_atom(val)
    finally:
        # The counter is module-level state; leaving it incremented after an
        # exception would corrupt every later call.
        depth -= 1
    return s
# IUPAC nucleotide codes mapped to their complements.  Built once at import
# time; lower-case codes are included so case is preserved in the result.
_RC_BASES = 'ACGTURYKMBVDHSWN'
_RC_COMPLEMENTS = 'TGCAAYRMKVBHDSWN'
_RC_TABLE = str.maketrans(_RC_BASES + _RC_BASES.lower(),
                          _RC_COMPLEMENTS + _RC_COMPLEMENTS.lower())


def get_rc(re):
    """Return the reverse complement of a DNA/RNA RE.

    Each IUPAC nucleotide code is replaced by its complement (``U`` maps to
    ``A``; ``A`` maps to ``T``) and the result is reversed.  Upper- and
    lower-case codes are both complemented, keeping their case; any other
    character is left as is.

    Args:
        re: Sequence string.  (The name shadows the stdlib ``re`` module
            inside this function only; it is kept for caller compatibility.)

    Returns:
        The reverse-complemented string.
    """
    return re.translate(_RC_TABLE)[::-1]
def format(val, indent=0, sci=False, tex=False):
    """Render ``val`` as a display string.

    Lists (per ``is_list``) are rendered as bracketed items, one per line
    when any item is a matrix; matrices as a boxed grid; everything else
    through ``format_atom``.

    Args:
        val: Value to render.
        indent: Number of leading spaces placed before the output.
        sci: When truthy, ``Rational`` values skip the plain ``str`` path, so
            very large or very small ones are written in scientific notation.
            The flag is passed on to nested elements.
        tex: When truthy (or when ``config.latex`` is truthy) the value is
            rendered with ``latex(...)`` instead.

    Returns:
        The formatted string.  In the LaTeX case every non-ASCII character of
        the ``latex(...)`` output is passed through ``gr_to_tex``.
    """
    if config.latex or tex:
        s = latex(Matrix(val) if is_matrix(val) else val)
        # substitute the Greek letters to tex representations
        return translate(r'[^\x00-\x7F]', lambda m: gr_to_tex(m[0]), s)

    def format_float(x):
        """Round ``x`` to ``config.precision`` significant digits."""
        prec = config.precision
        return float(f'%.{prec}g' % x)

    def format_scinum(x):
        """Render ``x`` as ``mantissa×10ⁿ`` with a superscript exponent."""
        supscripts = '⁰¹²³⁴⁵⁶⁷⁸⁹'

        def superscript(n):
            digits = ''.join(supscripts[int(d)] for d in str(abs(n)))
            return '⁻' + digits if n < 0 else digits

        def positive_case(x):
            exponent = floor(log(x) / log(10))
            mantissa = format_float(x / 10**exponent)
            # log(x)/log(10) can land just below an integer for exact powers
            # of ten, and rounding can push the mantissa up to 10; normalise
            # so the mantissa stays below 10.
            if mantissa >= 10:
                exponent += 1
                mantissa = format_float(x / 10**exponent)
            return f"{mantissa}×10{superscript(exponent)}"

        if x == 0:
            return '0'
        return positive_case(x) if x > 0 else '-' + positive_case(-x)

    def format_matrix(mat, indent):
        """Render ``mat`` as rows framed by rounded box corners."""
        # Cells inherit the caller's ``sci`` flag.
        mat = [[format(x, sci=sci) for x in row] for row in mat]
        space = max([max([len(s) for s in row]) for row in mat])

        def row_str(row, start, end, sep=' '):
            cells = sep.join(s.ljust(space) for s in row)
            return ' '*indent + f"{start} {cells}{end}"

        col_num = len(mat[0])
        return '\n'.join([row_str([''] * col_num, '╭', '╮')] +
                         [row_str(row, ' ', ' ', ', ') for row in mat] +
                         [row_str([''] * col_num, '╰', '╯')])

    def format_atom(atom):
        """Render a single non-list value."""
        if is_number(atom):
            mag = abs(atom)
            if type(atom) is complex:
                real_part = format_float(atom.real)
                im = format_float(atom.imag)
                return f"{real_part} {'-' if im<0 else '+'} {abs(im)}ⅈ"
            elif mag == inf:
                # Keep the sign: negative infinity must not print as '∞'.
                return '-∞' if atom < 0 else '∞'
            elif isinstance(atom, Rational) and not sci:
                if type(atom) is Fraction:
                    # limit_denominator returns a new Fraction; the result
                    # has to be kept for the call to have any effect.
                    atom = atom.limit_denominator(10**config.precision)
                return str(atom)
            elif mag <= 0.001 or mag >= 10000:
                return format_scinum(atom)
            else:
                return str(format_float(atom))
        elif type(atom) is FunctionType:
            # builtin
            # NOTE(review): relies on a ``str`` attribute being attached to
            # the function object elsewhere -- confirm.
            return atom.str
        elif isinstance(atom, Range):
            return str(atom)
        else:
            return pretty(atom, use_unicode=True)

    s = ' ' * indent
    if is_list(val):
        if any(is_matrix(item) for item in val):
            # One item per line, each indented two more spaces.
            items = ',\n'.join(format(v, indent + 2, sci) for v in val)
            s += '[\n' + items + '\n' + s + ']'
        elif is_matrix(val):
            s = format_matrix(val, indent)
        else:
            s += '[' + ', '.join(format(v, sci=sci) for v in val) + ']'
    else:
        s += format_atom(val)
    return s