Ejemplo n.º 1
0
        def _splitByDate(pattern: Pattern, content: str) -> List[str]:
            '''
                Splits the whole *.txt file content with the date extraction
                regex and returns the useful pieces.

                pattern: compiled regex the content is split on
                content: full text to split

                Empty pieces, None pieces and pieces that are nothing but an
                AM/PM marker (optionally preceded by one whitespace character,
                case-insensitive) are dropped. If some piece other than the
                very first one matches `pattern`, the returned list starts at
                the first such piece; otherwise every kept piece is returned.
            '''
            # `[ap]`, not `[a|p]`: inside a character class `|` is a literal,
            # so the previous pattern also discarded pieces such as "|m".
            meridiemOnly = reg_compile(r'^(\s?[ap]m)$', flags=IGNORECASE)

            # `pattern.split` produces '' between adjacent matches and None
            # for capture groups that did not participate; both are falsy.
            pieces = [piece for piece in pattern.split(content)
                      if piece and not meridiemOnly.search(piece)]

            # Index 0 is deliberately skipped, even when it matches `pattern`.
            for position, piece in enumerate(pieces):
                if position != 0 and pattern.search(piece):
                    return pieces[position:]

            return pieces
Ejemplo n.º 2
0
def extract_tokens(frame: pd.DataFrame, raw_column_name: str,
                   split_pattern: re.Pattern) -> list:
    """Split every cell of one column and collect the non-blank pieces.

    Each cell of ``frame[raw_column_name]`` is converted with ``str()`` and
    split with ``split_pattern``. Pieces that are empty or whitespace-only
    are dropped; the remaining pieces are returned unmodified (surrounding
    whitespace is kept) in row order, duplicates included.

    Args:
        frame: Table holding the raw column. It is not modified.
        raw_column_name: Name of the column whose cells are split.
        split_pattern: Compiled regex used as the delimiter.

    Returns:
        A list of string tokens.
    """
    pieces = frame[raw_column_name].apply(
        lambda cell: split_pattern.split(str(cell))).explode()
    # `re.split` emits None for capture groups that did not participate in a
    # match; the isinstance check skips those instead of failing on .strip().
    return [piece for piece in pieces
            if isinstance(piece, str) and piece.strip()]
Ejemplo n.º 3
0
def get_drug_names_by_suffix(drug_name: str, suffixes: List[str],
                             split_chars: re.Pattern,
                             remove_chars: re.Pattern):
    """Return the tokens of a drug name that end with one of the suffixes.

    The name is lower-cased, split on ``split_chars``, and every match of
    ``remove_chars`` is deleted from each token. A token is kept, once, when
    it ends with at least one entry of ``suffixes``. The suffixes themselves
    are compared as given (they are not lower-cased).

    Returns:
        The kept tokens, in their original order.
    """
    # str.endswith accepts a tuple and is true if any member matches.
    accepted_endings = tuple(suffixes)
    cleaned_tokens = (remove_chars.sub("", raw_token)
                      for raw_token in split_chars.split(drug_name.lower()))
    return [token for token in cleaned_tokens
            if token.endswith(accepted_endings)]
Ejemplo n.º 4
0
def cut_part(
        text: str,
        split_pattern: re.Pattern,
        with_spliter: bool = True,
        with_offset: bool = False) -> list:
    """
    Cut text into parts at every match of the given regex pattern.

    Parameters
    ----------
    text: raw text.
    split_pattern: compiled pattern whose matches act as splitters. Capture
        groups are allowed; only the whole match is used.
    with_spliter: whether each part keeps the splitter that follows it.
    with_offset: whether each item is a ``(part, start, end)`` tuple instead
        of a bare string.

    Returns
    --------
    out: the non-empty parts, in order. With offsets, ``start`` is where the
        part begins in ``text`` and ``end`` is where the next part begins
        (``len(text)`` for the last one), so ``end`` covers the trailing
        splitter even when ``with_spliter`` is False.
    """
    lst = []
    start = 0
    # Walking the match objects (instead of pairing findall() with split())
    # keeps the result correct when the pattern has capture groups: findall()
    # would return the groups rather than the whole match, and split() would
    # interleave the groups as extra parts.
    for match in split_pattern.finditer(text):
        end = match.end()
        part = text[start:end] if with_spliter else text[start:match.start()]
        if part:
            lst.append((part, start, end) if with_offset else part)
        start = end

    # Whatever follows the last splitter.
    tail = text[start:]
    if tail:
        lst.append((tail, start, len(text)) if with_offset else tail)
    return lst
Ejemplo n.º 5
0
def resplit(chunks_of_a_file: Iterator,
            split_pattern: re.Pattern) -> Iterable[ByteString]:
    """
    Lazily turn a stream of file chunks into data rows.

    Every chunk is appended to whatever was left over from the previous
    chunk, the result is split with `split_pattern`, and all pieces except
    the last one are yielded. The last piece may be an incomplete row, so it
    is carried over to the next chunk. Iteration stops at the first empty
    chunk or when the chunks run out; a non-empty leftover is then yielded
    as the final row.

    Borrowed from https://bugs.python.org/issue1152248#msg223491
    """
    carry = None
    for block in chunks_of_a_file:
        # Only join when there is a leftover; otherwise use the chunk as is.
        carry = b"".join((carry, block)) if carry else block
        if not block:
            break
        # split() always returns at least one piece, so the unpacking is safe.
        *complete_rows, carry = split_pattern.split(carry)
        yield from complete_rows
    if carry:
        yield carry
Ejemplo n.º 6
0
def process_mentioning_comment(body: str, bot_re: re.Pattern) -> list:
    """
    Extract the commands addressed to the bot from a comment body.

    A line is considered when it starts with ``!msbox`` or ``bot>``, or when
    ``bot_re`` matches somewhere in it and the line does not start with ``>``.
    For each considered line, the text after the last trigger occurrence is
    passed through ``_strip_extras`` and split at the first space.

    Returns a list with one ``[command, arguments]`` pair per considered
    line; ``arguments`` is None when the split yields a single piece.
    """
    remainders = []
    for raw_line in body.splitlines():
        mentioned = bot_re.search(raw_line) and not raw_line.startswith(">")
        if not (mentioned or raw_line.startswith(("!msbox", "bot>"))):
            continue

        # Selection above looks at the raw line; the trigger is removed from
        # the whitespace-stripped line.
        line = raw_line.strip()
        if line.startswith("!msbox"):
            trigger_parts = line.split("!msbox")
        elif line.startswith("bot>"):
            trigger_parts = line.split("bot>")
        else:
            trigger_parts = bot_re.split(line)
        remainders.append(trigger_parts[-1].strip())

    command_args = []
    for remainder in remainders:
        pieces = _strip_extras(remainder).split(" ", 1)
        if len(pieces) <= 1:
            # Command without arguments: pad so every entry has two slots.
            pieces = [pieces[0], None]
        command_args.append(pieces)
    return command_args
Ejemplo n.º 7
0
 def __splitByDate__(pattern: Pattern, content: str) -> List[str]:
     """
     Split `content` on `pattern` and return every piece except the first.

     The first piece is whatever precedes the first match (the whole
     content when nothing matches), so it is always discarded.
     """
     _leading, *pieces = pattern.split(content)
     return pieces