import re
import string

# NOTE: remove_accents(), tokenize() and STOP_WORDS are assumed to be defined
# elsewhere in this module (or imported from its utilities).


def normalize_tokens(token_spans, min_upper):
    '''
    DESCRIPTION: normalize tokens: lowercase, collapse extra whitespace,
    remove punctuation and remove accents.

    Parameters
    ----------
    token_spans: list
    min_upper: int
        Minimum number of characters a word must have to be lowercased
        (to prevent mistakes with acronyms).

    Returns
    -------
    token_span_processed2token_span: python dict
        Relates the normalized token combinations to the original,
        unnormalized ones.
    '''
    token_span2token_span = dict(zip(token_spans, token_spans))

    # Lowercase (only spans longer than min_upper, to protect acronyms)
    token_span_lower2token_span = dict(
        (k.lower(), v) if len(k) > min_upper else (k, v)
        for k, v in token_span2token_span.items())

    # Collapse runs of whitespace into a single space
    token_span_bs2token_span = dict(
        (re.sub(r'\s+', ' ', k).strip(), v)
        for k, v in token_span_lower2token_span.items())

    # Remove punctuation
    token_span_punc2token_span = dict(
        (k.translate(str.maketrans('', '', string.punctuation)), v)
        for k, v in token_span_bs2token_span.items())

    # Remove accents
    token_span_processed2token_span = dict(
        (remove_accents(k), v)
        for k, v in token_span_punc2token_span.items())

    return token_span_processed2token_span
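
# normalize_tokens() relies on the module-level remove_accents() helper. A
# minimal stand-in (an assumption, not necessarily this repo's version) could
# strip combining marks via NFKD normalization:
#
#   import unicodedata
#   def remove_accents(s):
#       return ''.join(c for c in unicodedata.normalize('NFKD', s)
#                      if not unicodedata.combining(c))
#
# Hypothetical usage, under that assumption:
#
#   >>> normalize_tokens(['Síndrome de Down', 'dolor  torácico'], min_upper=4)
#   {'sindrome de down': 'Síndrome de Down', 'dolor toracico': 'dolor  torácico'}
#
# Note that distinct inputs can normalize to the same key, in which case later
# entries silently overwrite earlier ones in the returned dict.
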
def format_text_info(txt, min_upper):
    '''
    DESCRIPTION:
    1. Obtain the list of words of interest in the text (no stopwords and
       longer than one character).
    2. Obtain a dictionary relating the words of interest to their positions
       in the original text. Words of interest are normalized: lowercased
       and with accents removed.

    Parameters
    ----------
    txt: str
        Text to format.
    min_upper: int
        Minimum number of characters a word must have to be lowercased
        (to prevent mistakes with acronyms).

    Returns
    -------
    words_final: set
        Set of normalized words in the text.
    words_processed2pos: dict
        Relates each normalized word (trimmed, stopwords removed, lowercased,
        accents removed) to its positions in the original text.
    '''
    # Get individual words and their position in the original txt
    words = tokenize(txt)

    # Remove leading/trailing punctuation and whitespace
    words_no_punctuation = list(
        map(lambda x: x.strip(string.punctuation + ' '), words))

    # Remove single-character words and stopwords
    large_words = list(filter(lambda x: len(x) > 1, words_no_punctuation))
    words_no_stw = set(
        filter(lambda x: x.lower() not in STOP_WORDS, large_words))

    # Create dict with words and their positions in the text
    # (note: finditer also matches occurrences inside longer words)
    words2pos = {}
    for word in words_no_stw:
        occurrences = list(re.finditer(re.escape(word), txt))
        if len(occurrences) == 0:
            print('ERROR: ORIGINAL WORD NOT FOUND IN ORIGINAL TEXT')
            print(word)
        pos = list(map(lambda x: x.span(), occurrences))
        words2pos[word] = pos

    # Dictionary relating original words with their processed form
    words2words = dict(zip(words_no_stw, words_no_stw))
    words2words_processed = dict(
        (k, remove_accents(k.lower())) if len(k) > min_upper else
        (k, remove_accents(k)) for k, v in words2words.items())

    # Map original positions to the processed words
    words_processed2pos = {}
    for k, v in words2pos.items():
        k_processed = words2words_processed[k]
        if k_processed not in words_processed2pos:
            words_processed2pos[k_processed] = v
        else:
            words_processed2pos[k_processed] = (
                words_processed2pos[k_processed] + v)

    # Set of transformed words
    words_final = set(words_processed2pos)

    return words_final, words_processed2pos
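
# A minimal usage sketch (hypothetical input; it assumes tokenize() is a plain
# word tokenizer and STOP_WORDS contains Spanish stopwords such as 'de'):
#
#   >>> words, word2pos = format_text_info('Dolor torácico de origen incierto.',
#   ...                                    min_upper=4)
#   >>> sorted(words)
#   ['dolor', 'incierto', 'origen', 'toracico']
#   >>> word2pos['toracico']
#   [(6, 14)]
#
# Positions always index into the ORIGINAL text, so downstream code can map
# normalized matches back to exact character spans.
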
def format_ann_info(df_annot, min_upper, with_notes=False):
    '''
    DESCRIPTION: build useful Python dicts from a DataFrame with the info
    from the TSV file.

    Parameters
    ----------
    df_annot: pandas DataFrame
        With 4 columns: 'filename', 'label', 'code', 'span'.
    min_upper: int
        Minimum number of characters a word must have to be lowercased
        (to prevent mistakes with acronyms).
    with_notes: bool
        Whether we are predicting codes or not.

    Returns
    -------
    file2annot: python dict
    file2annot_processed: python dict
    annot2label: python dict
        Relates every unmodified annotation to its label.
    annot2annot_processed: python dict
        Relates every unmodified annotation to its normalized words.
    NEWannot2code: python dict
        Relates every annotation span to its cleaned list of codes
        (empty if with_notes is False).
    '''
    # Build useful Python dicts from DataFrame with info from .ann files
    file2annot = {}
    for filename in list(df_annot.filename):
        file2annot[filename] = list(
            df_annot[df_annot['filename'] == filename].span)

    set_annotations = set(df_annot.span)

    annot2label = dict(zip(df_annot.span, df_annot.label))

    if with_notes:
        annot2code = df_annot.groupby('span')['code'].apply(
            lambda x: x.tolist()).to_dict()
    else:
        annot2code = {}

    # Filter codes: remove duplicates, and remove empty codes ONLY if the
    # same span also has non-empty codes
    fclean = lambda x: list(
        set(filter(None, x) if any(map(lambda c: len(c) > 0, x)) else x))
    NEWannot2code = dict((k, fclean(v)) for k, v in annot2code.items())

    annot2annot = dict(zip(set_annotations, set_annotations))

    # Split values: {'one': 'three two'} must become {'one': ['three', 'two']}
    annot2annot_split = annot2annot.copy()
    annot2annot_split = dict(
        (k, tokenize(v)) for k, v in annot2annot_split.items())

    # Do not store stopwords or single-character words as values
    for k, v in annot2annot_split.items():
        annot2annot_split[k] = list(filter(lambda x: x not in STOP_WORDS, v))
    for k, v in annot2annot_split.items():
        annot2annot_split[k] = list(filter(lambda x: len(x) > 1, v))

    # Trim leading/trailing punctuation and whitespace
    annot2annot_trim = annot2annot.copy()
    for k, v in annot2annot_split.items():
        annot2annot_trim[k] = list(
            map(lambda x: x.strip(string.punctuation + ' '), v))

    # Lowercase values (only words longer than min_upper, to protect acronyms)
    annot2annot_lower = annot2annot_trim.copy()
    for k, v in annot2annot_trim.items():
        annot2annot_lower[k] = list(
            map(lambda x: x.lower() if len(x) > min_upper else x, v))

    # Remove accents from annotations
    annot2annot_processed = annot2annot_lower.copy()
    for k, v in annot2annot_lower.items():
        annot2annot_processed[k] = list(map(lambda x: remove_accents(x), v))

    # file -> normalized (unaccented) annotations
    file2annot_processed = {}
    for (k, v) in file2annot.items():
        aux = list(map(lambda x: annot2annot_processed[x], v))
        file2annot_processed[k] = aux

    return (file2annot, file2annot_processed, annot2label,
            annot2annot_processed, NEWannot2code)
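
# A minimal usage sketch (hypothetical rows; the label and code values are
# invented for illustration, and with_notes=True requires the 'code' column):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'filename': ['doc1.txt', 'doc1.txt'],
#   ...                    'label': ['MORFOLOGIA_NEOPLASIA'] * 2,
#   ...                    'code': ['8140/3', ''],
#   ...                    'span': ['adenocarcinoma de próstata', 'Carcinoma']})
#   >>> _, _, annot2label, annot2annot_processed, annot2code = \
#   ...     format_ann_info(df, min_upper=4, with_notes=True)
#   >>> annot2annot_processed['adenocarcinoma de próstata']
#   ['adenocarcinoma', 'prostata']
#
# With with_notes=True, empty codes are dropped for a span only when that span
# also carries non-empty codes; duplicate codes are always removed.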