from re import IGNORECASE, compile as reg_compile
from typing import List, Pattern


def _splitByDate(pattern: Pattern, content: str) -> List[str]:
    '''
    Splits whole *.txt file content using the date extraction regex we just built.
    '''
    def _getTimeFormatRegex() -> Pattern:
        r'''
        Returns a regular expression for extracting the AM/PM marker from a chat
        timestamp, where AM/PM may be prefixed with "\s" -> whitespace.
        '''
        return reg_compile(r'^(\s?[ap]m)$', flags=IGNORECASE)

    _timeFormatRegex = _getTimeFormatRegex()
    # Drop empty/None fragments and bare "am"/"pm" fragments left over by the split.
    splitted = [
        v for v in pattern.split(content)
        if v and not _timeFormatRegex.search(v)
    ]
    # Skip any preamble before the first fragment that actually contains a date.
    index = -1
    for k, v in enumerate(splitted):
        if k != 0 and pattern.search(v):
            index = k
            break
    if index == -1:
        return splitted
    return splitted[index:]
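# A minimal usage sketch for _splitByDate. The date pattern below is an
# assumption for illustration; the real regex is built elsewhere in the module.
_datePattern = reg_compile(r'(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2} [ap]m)', flags=IGNORECASE)
_chat = 'exported by SomeApp\n12/03/21, 9:15 am - Alice: hi'
print(_splitByDate(_datePattern, _chat))
# -> ['12/03/21, 9:15 am', ' - Alice: hi']  (leading non-chat preamble is dropped)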
import re
from typing import List

import pandas as pd


def extract_tokens(frame: pd.DataFrame, raw_column_name: str,
                   split_pattern: re.Pattern) -> List[str]:
    # Split each raw cell into tokens, explode to one token per row,
    # and keep only the non-blank tokens.
    tags = (frame
            .assign(tag=lambda f: f[raw_column_name].apply(
                lambda x: split_pattern.split(str(x))))
            .explode('tag')
            .loc[lambda f: f['tag'].apply(lambda x: len(x.strip()) > 0)]
            ['tag']
            .tolist())
    return tags
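# Usage sketch with made-up data: splitting a free-text column on commas and
# semicolons. Surrounding whitespace is preserved; only blank tokens are dropped.
_df = pd.DataFrame({'tags_raw': ['a, b; c', 'b, d']})
print(extract_tokens(_df, 'tags_raw', re.compile(r'[,;]')))
# -> ['a', ' b', ' c', 'b', ' d']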
import re
from typing import List


def get_drug_names_by_suffix(drug_name: str, suffixes: List[str],
                             split_chars: re.Pattern,
                             remove_chars: re.Pattern) -> List[str]:
    """Return the lower-cased tokens of `drug_name` that end with one of `suffixes`."""
    drug_name_token_list = []
    drug_name = drug_name.lower()
    drug_token = split_chars.split(drug_name)
    drug_token = [remove_chars.sub("", token) for token in drug_token]
    for token in drug_token:
        for suffix in suffixes:
            if token.endswith(suffix):
                drug_name_token_list.append(token)
                break
    return drug_name_token_list
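# Usage sketch; the suffix list and both regexes are illustrative assumptions.
_splitter = re.compile(r'[\s/+]')
_cleaner = re.compile(r'[^a-z]')
print(get_drug_names_by_suffix('Amoxicillin + Clavulanate',
                               ['cillin', 'mycin'], _splitter, _cleaner))
# -> ['amoxicillin']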
import re


def cut_part(text: str,
             split_pattern: re.Pattern,
             with_spliter: bool = True,
             with_offset: bool = False) -> list:
    """
    Cut text into parts by the given regex pattern.

    Parameters
    ----------
    text: raw text.
    split_pattern: how to split the text.
    with_spliter: whether the parts keep their trailing splitters.
    with_offset: whether the parts carry (start, end) offsets.

    Returns
    -------
    out: the cut parts.
    """
    spliters = split_pattern.findall(text)
    length = len(spliters)
    lst = []
    start = 0
    for i, part in enumerate(split_pattern.split(text)):
        if i < length:
            if with_spliter:
                # Re-attach the splitter to the part it terminates.
                part = part + spliters[i]
                len_spliter = 0
            else:
                len_spliter = len(spliters[i])
        else:
            len_spliter = 0
        # Offsets always advance past the splitter, even when it is dropped.
        end = start + len(part) + len_spliter
        if part:
            if with_offset:
                item = (part, start, end)
            else:
                item = part
            lst.append(item)
        start = end
    return lst
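# Usage sketch: sentence-splitting on terminal punctuation, keeping both the
# delimiters and the character offsets (the pattern is an assumed example).
_sent_end = re.compile(r'[.!?]')
print(cut_part('Hi! How are you? Fine.', _sent_end, with_spliter=True, with_offset=True))
# -> [('Hi!', 0, 3), (' How are you?', 3, 16), (' Fine.', 16, 22)]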
import re
from typing import ByteString, Iterable, Iterator


def resplit(chunks_of_a_file: Iterator,
            split_pattern: re.Pattern) -> Iterable[ByteString]:
    """
    Reads a file one chunk at a time, splits the chunks into data rows by
    `split_pattern`, and joins partial data rows across chunk boundaries.

    Borrowed from https://bugs.python.org/issue1152248#msg223491
    """
    partial_line = None
    for chunk in chunks_of_a_file:
        if partial_line:
            # Prepend the leftover tail of the previous chunk.
            partial_line = b"".join((partial_line, chunk))
        else:
            partial_line = chunk
        if not chunk:
            break
        lines = split_pattern.split(partial_line)
        # The last piece may be an incomplete row; hold it back for the next chunk.
        partial_line = lines.pop()
        yield from lines
    if partial_line:
        yield partial_line
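# Usage sketch: byte chunks whose rows straddle chunk boundaries (the chunks
# and the record separator are illustrative).
_record_sep = re.compile(rb'\n')
_chunks = iter([b'row1\nro', b'w2\nrow3'])
print(list(resplit(_chunks, _record_sep)))
# -> [b'row1', b'row2', b'row3']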
import re


def process_mentioning_comment(body: str, bot_re: re.Pattern) -> list:
    """
    Given a comment body and a bot-name regex, parse it into a list of
    [command, arguments] pairs.
    """
    lines = body.splitlines()
    # Keep only lines addressed to the bot (but not quoted ones) and the
    # explicit "!msbox" / "bot>" command lines.
    lines = [
        l.strip() for l in lines
        if (bot_re.search(l) and not l.startswith(">"))
        or l.startswith("!msbox")
        or l.startswith("bot>")
    ]
    nl = []
    for l in lines:
        if l.startswith("!msbox"):
            nl.append(l.split("!msbox")[-1].strip())
        elif l.startswith("bot>"):
            nl.append(l.split("bot>")[-1].strip())
        else:
            nl.append(bot_re.split(l)[-1].strip())
    # `_strip_extras` is a helper defined elsewhere in the original module.
    command_args = [_strip_extras(l).split(" ", 1) for l in nl]
    # Pad single-word commands with a None argument.
    command_args = [c if len(c) > 1 else [c[0], None] for c in command_args]
    return command_args
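# Usage sketch. The bot name is made up, and since `_strip_extras` lives
# elsewhere, a trivial stand-in (an assumption) is defined so the example runs.
def _strip_extras(s: str) -> str:
    return s.strip()

_bot_re = re.compile(r'@msbox-bot')
print(process_mentioning_comment('@msbox-bot rebuild\n!msbox merge main', _bot_re))
# -> [['rebuild', None], ['merge', 'main']]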
from typing import List, Pattern


def __splitByDate__(pattern: Pattern, content: str) -> List[str]:
    # Everything before the first date match is discarded.
    return pattern.split(content)[1:]
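# Usage sketch (the ISO-date pattern is an assumed example). The leading
# fragment before the first date, here the export header, is dropped.
import re
_date = re.compile(r'\d{4}-\d{2}-\d{2}')
print(__splitByDate__(_date, 'header\n2021-01-01 first\n2021-01-02 second'))
# -> [' first\n', ' second']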