Code example #1
0
def normalize_tokens(token_spans, min_upper):
    '''
    DESCRIPTION: normalize tokens: lowercase, remove extra whitespaces,
    remove punctuation and remove accents.

    Parameters
    ----------
    token_spans: list
        Original (unnormalized) token combinations.
    min_upper: int
        It specifies the minimum number of characters of a word to lowercase
        it (to prevent mistakes with acronyms).

    Returns
    -------
    token_span_processed2token_span: python dict
        It relates each normalized token combination with the original
        unnormalized one.
    '''
    # Start from an identity mapping so every normalization step below can
    # rewrite the key while keeping the original token as the value.
    token_span2token_span = dict(zip(token_spans, token_spans))

    # Lowercase only tokens longer than min_upper (shorter ones may be acronyms)
    token_span_lower2token_span = dict(
        (k.lower(), v) if len(k) > min_upper else (k, v)
        for k, v in token_span2token_span.items())

    # Collapse whitespace runs to a single space and trim the ends.
    # Raw string r'\s+' avoids the invalid-escape-sequence warning of '\s+'.
    token_span_bs2token_span = dict(
        (re.sub(r'\s+', ' ', k).strip(), v)
        for k, v in token_span_lower2token_span.items())

    # Remove all punctuation in one C-level pass via str.translate
    token_span_punc2token_span = dict(
        (k.translate(str.maketrans('', '', string.punctuation)), v)
        for k, v in token_span_bs2token_span.items())

    # Remove accents (project helper defined elsewhere in this module)
    token_span_processed2token_span = dict(
        (remove_accents(k), v) for k, v in token_span_punc2token_span.items())

    return token_span_processed2token_span
Code example #2
0
def format_text_info(txt, min_upper):
    '''
    DESCRIPTION:
    1. Obtain list of words of interest in text (no STPW and longer than 1 character)
    2. Obtain dictionary with words of interest and their position in the
    original text. Words of interest are normalized: lowercased and removed
    accents.

    Parameters
    ----------
    txt: str
        contains the text to format.
    min_upper: int
        Specifies the minimum number of characters of a word to lowercase it
        (to prevent mistakes with acronyms).

    Returns
    -------
    words_final: set
        set of normalized words in text.
    words_processed2pos: dictionary
        It relates the word normalized (trimmed, removed stpw, lowercased,
        removed accents) and its position in the original text.
    '''
    # Get individual words and strip surrounding punctuation/whitespace
    words = tokenize(txt)
    words_no_punctuation = [w.strip(string.punctuation + ' ') for w in words]

    # Keep multi-character, non-stopword words (stopword check is
    # case-insensitive, but the original casing is preserved here)
    large_words = [w for w in words_no_punctuation if len(w) > 1]
    words_no_stw = {w for w in large_words if w.lower() not in STOP_WORDS}

    # Map each surviving word to every (start, end) span where it occurs
    # in the original text. re.escape guards against regex metacharacters.
    words2pos = {}
    for word in words_no_stw:
        occurrences = list(re.finditer(re.escape(word), txt))
        if not occurrences:
            # Should not happen: the word was extracted from txt itself,
            # but stripping punctuation can in principle alter it.
            print('ERROR: ORIGINAL WORD NOT FOUND IN ORIGINAL TEXT')
            print(word)
        words2pos[word] = [m.span() for m in occurrences]

    # Normalize each word: lowercase only if longer than min_upper
    # (to protect acronyms), then strip accents.
    words2words_processed = {
        w: remove_accents(w.lower() if len(w) > min_upper else w)
        for w in words_no_stw}

    # Merge the positions of all words that normalize to the same form
    words_processed2pos = {}
    for word, pos in words2pos.items():
        words_processed2pos.setdefault(
            words2words_processed[word], []).extend(pos)

    # Set of normalized words
    words_final = set(words_processed2pos)

    return words_final, words_processed2pos
Code example #3
0
def format_ann_info(df_annot, min_upper, with_notes=False):
    '''
    DESCRIPTION: Build useful Python dicts from DataFrame with info from TSV file

    Parameters
    ----------
    df_annot: pandas DataFrame
        With 4 columns: 'filename', 'label', 'code', 'span'
    min_upper: int
        It specifies the minimum number of characters of a word to lowercase
        it (to prevent mistakes with acronyms).
    with_notes: bool
        whether we are predicting codes, or not

    Returns
    -------
    file2annot: python dict
        filename -> list of its annotation spans (original form).
    file2annot_processed: python dict
        filename -> list of normalized annotation token lists.
    annot2label: python dict
        It has every unmodified annotation and its label.
    annot2annot_processed: python dict
        It has every unmodified annotation and the words it has normalized.
    NEWannot2code: python dict
        annotation span -> deduplicated list of codes (empty if with_notes
        is False).
    '''
    # filename -> list of spans. Iterate unique filenames only: filtering
    # the DataFrame once per *row* (as with duplicated filenames) repeats
    # identical work and makes this quadratic in the number of rows.
    file2annot = {}
    for filename in df_annot.filename.unique():
        file2annot[filename] = list(
            df_annot[df_annot['filename'] == filename].span)

    set_annotations = set(df_annot.span)

    annot2label = dict(zip(df_annot.span, df_annot.label))

    if with_notes:
        # span -> list of all codes annotated for that span
        annot2code = df_annot.groupby('span')['code'].apply(list).to_dict()
    else:
        annot2code = {}

    # Filter codes: remove duplicates, and remove empty codes ONLY when the
    # same span also has non-empty ones (otherwise keep the empties).
    def _clean_codes(codes):
        if any(len(c) > 0 for c in codes):
            codes = filter(None, codes)
        return list(set(codes))

    NEWannot2code = {k: _clean_codes(v) for k, v in annot2code.items()}

    # Split each annotation into tokens, dropping stopwords (case-sensitive
    # check, matching the original behavior) and single-character tokens.
    # {'one': 'three two'} becomes {'one': ['three', 'two']}
    annot2annot_split = {}
    for annot in set_annotations:
        tokens = tokenize(annot)
        annot2annot_split[annot] = [
            t for t in tokens if t not in STOP_WORDS and len(t) > 1]

    # Trim punctuation or multiple spaces from every token
    annot2annot_trim = {
        k: [t.strip(string.punctuation + ' ') for t in v]
        for k, v in annot2annot_split.items()}

    # Lowercase tokens longer than min_upper (shorter ones may be acronyms)
    annot2annot_lower = {
        k: [t.lower() if len(t) > min_upper else t for t in v]
        for k, v in annot2annot_trim.items()}

    # Remove accents from annotation tokens
    annot2annot_processed = {
        k: [remove_accents(t) for t in v]
        for k, v in annot2annot_lower.items()}

    # filename -> normalized annotations, aligned with file2annot's lists
    file2annot_processed = {
        k: [annot2annot_processed[a] for a in v]
        for k, v in file2annot.items()}

    return file2annot, file2annot_processed, annot2label, annot2annot_processed, NEWannot2code