Code Example #1
                # for the last word of a sentence
                else:
                    # end the sentence with a line break, adding a period if there is no other symbol marking the end of the sentence
                    if bool(re.search(r"\w*[.?!]$", w)) or bool(
                            re.search(r"\.[‘’“”\'\"]$", w)):
                        text_out.write('\n')
                    else:
                        text_out.write('.\n')

    text_out.close()


# abbreviations for sentence tokenisation
extra_abbreviations_en = [
    'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e', 'e.g', 'approx', 'apt',
    'appt', 'dept', 'est', 'min', 'max', 'misc', 'no', 'acc', 'fig', 'a.m',
    'p.m', 'a.d', 'b.c', 'etc', 'ca', 'cf', 'ed', 'est', 'f', 'ff', 'pres'
]
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_param = PunktParameters()
# add the abbreviations to the tokenizer
punkt_param.abbrev_types = set(extra_abbreviations_en)
my_tokenizer = PunktSentenceTokenizer(punkt_param)

# execute the preprocessing for all files in one directory
for f in os.listdir("Data/English/theses"):
    # look for English texts only
    if f.startswith("en"):
        process("Data/English/theses/" + f,
                "PreprocessedData/English/theses/" + f + ".txt")
Code Example #2
def marker_surr_patt(in_dir):
    """ Find most frequent POS tag patterns surrounding citation marker
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    file_names = os.listdir(in_dir)
    patt_comb_freq_map = {}
    patt_orig_freq_map = {}
    for file_idx, fn in enumerate(file_names):
        if file_idx%100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue

        if re.search(r'[a-z]', aid):
            split = re.search(r'[a-z][0-9]', aid).span()[0] + 1
            aid = aid[:split] + '/' + aid[split:]

        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)

        marker = ' \u241F '
        doc_len = len(text)
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            if marker in sentence:
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                indices = [i for i, tup in enumerate(words)
                           if tup[0] == marker.strip()]
                for word_idx in indices:
                    word = words[word_idx][0]
                    if word == marker.strip():
                        patt_comb = [None, None, None, '[]', None, None, None]
                        patt_orig = [None, None, None, '[]', None, None, None]
                        for shift in range(-3, 4):
                            x_idx = shift+3
                            if shift == 0:
                                # marker itself
                                continue
                            if word_idx+shift < 0 or \
                                    word_idx+shift >= len(words):
                                patt_comb[x_idx] = '<EOS>'
                                patt_orig[x_idx] = '<EOS>'
                                continue
                            wrd = words[word_idx+shift][0]
                            pos = words[word_idx+shift][1]
                            patt_orig[x_idx] = pos
                            if 'V' in pos:
                                patt_comb[x_idx] = 'V'
                            elif pos in ['NN', 'NNS']:
                                patt_comb[x_idx] = 'NN'
                            elif pos in ['NNP', 'NNPS']:
                                patt_comb[x_idx] = 'NNP'
                            elif pos == 'IN':
                                patt_comb[x_idx] = 'IN'
                            elif 'JJ' in pos:
                                patt_comb[x_idx] = 'JJ'
                            elif 'W' in pos:
                                patt_comb[x_idx] = 'WH'
                            elif 'RB' in pos:
                                patt_comb[x_idx] = 'ADV'
                            elif 'PR' in pos:
                                patt_comb[x_idx] = 'PR'
                            elif wrd == 'FORMULA':
                                patt_comb[x_idx] = 'FORMULA'
                            elif wrd == 'FIGURE':
                                patt_comb[x_idx] = 'FIGURE'
                            elif wrd == 'TABLE':
                                patt_comb[x_idx] = 'TABLE'
                            else:
                                patt_comb[x_idx] = 'OTHER'
                        comb_id = '¦'.join(patt_comb)
                        orig_id = '¦'.join(patt_orig)
                        # # look at examples
                        # if orig_id == 'VBN¦IN¦NNP¦[]¦<EOS>¦<EOS>¦<EOS>':
                        #     print(sentence)
                        #     input()
                        #     print('.')
                        if comb_id not in patt_comb_freq_map:
                            patt_comb_freq_map[comb_id] = 0
                        patt_comb_freq_map[comb_id] += 1
                        if orig_id not in patt_orig_freq_map:
                            patt_orig_freq_map[orig_id] = 0
                        patt_orig_freq_map[orig_id] += 1
        # if file_idx > 200:
        #    break

    patt_comb_freq = sorted(patt_comb_freq_map.items(),
                            key=operator.itemgetter(1), reverse=True)
    patt_orig_freq = sorted(patt_orig_freq_map.items(),
                            key=operator.itemgetter(1), reverse=True)
    print('- - - C O M B - - -')
    for pid in patt_comb_freq[:25]:
        print(pid)
    print('- - - O R I G - - -')
    for pid in patt_orig_freq[:25]:
        print(pid)

    store_comb = []
    for tup in patt_comb_freq:
        pid = tup[0]
        freq = tup[1]
        if '[]¦<EOS>¦<EOS>¦<EOS>' in pid:
            new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>')
            store_comb.append((new_pid, freq))
    with open('marker_comb.json', 'w') as f:
        json.dump(store_comb, f)

    store_orig = []
    for tup in patt_orig_freq:
        pid = tup[0]
        freq = tup[1]
        if '[]¦<EOS>¦<EOS>¦<EOS>' in pid:
            new_pid = pid.replace('[]¦<EOS>¦<EOS>¦<EOS>', '<EOS>')
            store_orig.append((new_pid, freq))
    with open('marker_orig.json', 'w') as f:
        json.dump(store_orig, f)
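
span_tokenize yields (start, end) character offsets rather than sentence strings, which is what lets the loop above slice sentence_orig straight out of the raw text before substituting the citation marker. A small sketch of those offsets together with the pos_tag tuples the pattern extraction iterates over (sample text invented; pos_tag needs the standard NLTK tagger data downloaded):

from nltk import pos_tag
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(['al', 'fig', 'e.g', 'i.e'])
tokenizer = PunktSentenceTokenizer(punkt_param)

text = "Results are shown in Fig. 2. They match earlier work."
for start, end in tokenizer.span_tokenize(text):
    sentence = text[start:end]
    print((start, end), pos_tag(sentence.split()))
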
Code Example #3
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings
  
    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text =  'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> tokenize_latin_words(text)
    ['Dices', 'ἐστιν', 'ἐμός', 'pulchrum', 'esse', 'inimicos', 'ulcisci', '.']
  
    :param string: the string to be tokenized
    :returns: a list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'), (r'tecum', 'cum te'),
                    (r'secum', 'cum se'), (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'), (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'), (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'), (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'), (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'), (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')]

    for replacement in replacements:
        string = re.sub(replacement[0],
                        matchcase(replacement[1]),
                        string,
                        flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = [
        'c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'",
        'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul',
        'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop'
    ]
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them;
        # needed to make stream.readlines work in PlaintextCorpusReader

        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]
                                                ] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]
                                                ] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]
                                            ] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
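
The matchcase helper above (borrowed from the Python Cookbook) is what keeps the case-insensitive replacements readable: it inspects the matched text and re-cases the replacement to match. A stand-alone sketch of that idiom with an invented sample:

import re

def matchcase(word):
    def replace(m):
        text = m.group()
        if text.isupper():
            return word.upper()
        elif text.islower():
            return word.lower()
        elif text[0].isupper():
            return word.capitalize()
        return word
    return replace

print(re.sub(r'mecum', matchcase('cum me'), 'Mecum venit.', flags=re.IGNORECASE))
# -> 'Cum me venit.'
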
Code Example #4
def read_xml(path):  # pylint: disable=too-many-locals
    """read nxml, xml, html
    """
    try:
        with open(path, 'rb') as f:
            s = f.read()
        s = s.decode('utf8')
        s = s.replace('<break/>', ', ')
        soup = BeautifulSoup(s, 'lxml')

        title = soup.find('article-title')
        title = title.getText(' ') if title is not None else ''
        title = clean_text(title)

        body = [title]
        tables = []

        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['fig'])
        tokenizer = PunktSentenceTokenizer(punkt_param)

        for tb in soup.findAll('table'):
            table = {'cells': []}
            for tr in tb.findAll(['tr']):
                row_elements = []
                for td in tr.findAll(['td', 'th']):
                    row_elements.append({'text': clean_text(td.getText(' '))})
                table['cells'].append(row_elements)

            parent = tb
            while parent is not None and parent.find('label') is None:
                parent = parent.find_parent()
            if parent is not None:
                label = parent.find('label').getText(' ')
                caption_obj = parent.find('caption')
                if caption_obj is not None:
                    caption = caption_obj.getText(' ')
                else:
                    caption = ''
            else:
                label, caption = None, None

            table.update({'caption': {
                'text': caption,
                'label': label,
            }})
            tables.append(table)

        for tag in ['notes', 'ref_list', 'floats-group']:
            for element in soup.findAll(tag):
                element.decompose()

        for paragraph in soup.findAll('p'):
            for t in paragraph.findAll('table'):
                t.extract()
            p = map(clean_text, paragraph.getText(' ').split())
            p = ' '.join(filter(bool, p))
            body += tokenizer.tokenize(p)
        body = '\n'.join(body)

        data = PaperData(body, tables)

    except Exception:
        logger.info('fail: %s', path)
        traceback.print_exc()
        return PaperData()

    return data
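
The table extraction in read_xml reduces to a nested findAll walk over tr/td/th elements. A tiny sketch of just that walk on invented markup (assumes bs4 with lxml installed; 'html.parser' would also do):

from bs4 import BeautifulSoup

html = '<table><tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr></table>'
soup = BeautifulSoup(html, 'lxml')
cells = [[td.getText(' ') for td in tr.findAll(['td', 'th'])]
         for tr in soup.find('table').findAll('tr')]
print(cells)  # expected: [['A', 'B'], ['1', '2']]
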
Code Example #5
 def __init__(self):
     self.punkt_param = PunktParameters()
     self.punkt_param.abbrev_types = set(ABBREVIATIONS)
     self.sent_tokenizer = LatinPunktSentenceTokenizer()
     self.word_tokenizer = LatinLanguageVars()
Code Example #6
 def init(self):
     if self.sent_tokeniser_ is None:
         punkt_param = PunktParameters()
         punkt_param.abbrev_types = self.compile_abbreviations()
         self.sent_tokeniser_ = PunktSentenceTokenizer(punkt_param)
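
compile_abbreviations is not shown in this snippet; a hypothetical implementation (pure assumption about the missing method) would simply normalise a configured list the way Punkt expects, i.e. lowercase and without the final dot:

def compile_abbreviations(self):
    # hypothetical stand-in: Punkt stores abbreviations lowercase, without the trailing period
    raw = getattr(self, 'abbreviations_', ['Dr.', 'e.g.', 'Fig.'])  # 'abbreviations_' is an assumed attribute
    return set(a.lower().rstrip('.') for a in raw)
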
Code Example #7
File: word.py, Project: saikswaroop/cltk
def tokenize_latin_words(string):
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        if temp_tokens[0].endswith('ne'):
            if temp_tokens[0].lower() not in exceptions:
                temp = [temp_tokens[0][:-2], '-ne']
                temp_tokens = temp + temp_tokens[1:]

        if temp_tokens[-1].endswith('.'):
            final_word = temp_tokens[-1][:-1]
            del temp_tokens[-1]
            temp_tokens += [final_word, '.']

        for token in temp_tokens:
            tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
Code Example #8
def data_to_words(data):
    global words
    punkt_param = PunktParameters()
    # tried to handle the abbreviation 'U.S' here, but it does not seem to work
    abbreviation = ['U.S']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    pat_letter = re.compile(r'[^a-zA-Z \']+')
    # expand some common English contractions
    # to find the 's following the pronouns; re.I means ignore case
    pat_is = re.compile("(it|he|she|that|this|there|here)(\'s)", re.I)
    # to find the 's following the letters
    pat_s = re.compile("(?<=[a-zA-Z])\'s")
    # to find the ' following the words ending by s
    pat_s2 = re.compile("(?<=s)\'s?")
    # to find the abbreviation of not
    pat_not = re.compile("(?<=[a-zA-Z])n\'t")
    # to find the abbreviation of would
    pat_would = re.compile("(?<=[a-zA-Z])\'d")
    # to find the abbreviation of will
    pat_will = re.compile("(?<=[a-zA-Z])\'ll")
    # to find the abbreviation of am
    pat_am = re.compile("(?<=[I|i])\'m")
    # to find the abbreviation of are
    pat_are = re.compile("(?<=[a-zA-Z])\'re")
    # to find the abbreviation of have
    pat_ve = re.compile("(?<=[a-zA-Z])\'ve")

    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, str) and (key == "id"
                                           or key == "is_impossible"):
                continue
            elif isinstance(value, int) and key == "answer_start":
                continue
            elif isinstance(value, str):
                # join the tokenized sentences back into a single string
                value = "".join(tokenizer.tokenize(value))
                # strip punctuation with a regex, keeping apostrophes
                value = pat_letter.sub(' ', value).strip().lower()
                value = pat_is.sub(r"\1 is", value)
                value = pat_s.sub("", value)
                value = pat_s2.sub("", value)
                value = pat_not.sub(" not", value)
                value = pat_would.sub(" would", value)
                value = pat_will.sub(" will", value)
                value = pat_am.sub(" am", value)
                value = pat_are.sub(" are", value)
                value = pat_ve.sub(" have", value)
                value = value.replace('\'', ' ')
                words.extend(WordPunctTokenizer().tokenize(value))
            else:
                data_to_words(value)
    elif isinstance(data, list):
        for i in data:
            if isinstance(i, str):
                i = "".join(tokenizer.tokenize(i))
                i = pat_letter.sub(' ', i).strip().lower()
                i = pat_is.sub(r"\1 is", i)
                i = pat_s.sub("", i)
                i = pat_s2.sub("", i)
                i = pat_not.sub(" not", i)
                i = pat_would.sub(" would", i)
                i = pat_will.sub(" will", i)
                i = pat_am.sub(" am", i)
                i = pat_are.sub(" are", i)
                i = pat_ve.sub(" have", i)
                i = i.replace('\'', ' ')
                words.extend(WordPunctTokenizer().tokenize(i))
            else:
                data_to_words(i)
    else:
        # print("{}".format(data))
        pass
    return words
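
The contraction patterns are easiest to sanity-check in isolation. A short sketch using three of the regexes above on a made-up sentence:

import re

pat_is = re.compile("(it|he|she|that|this|there|here)('s)", re.I)
pat_not = re.compile("(?<=[a-zA-Z])n't")
pat_ve = re.compile("(?<=[a-zA-Z])'ve")

s = "It's clear they don't think we've finished."
s = pat_is.sub(r"\1 is", s)
s = pat_not.sub(" not", s)
s = pat_ve.sub(" have", s)
print(s)  # It is clear they do not think we have finished.
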
Code Example #9
File: preprocessor.py, Project: johndpope/SumMe
import nltk
from nltk.tree import Tree
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
import os.path
from py4j.java_gateway import JavaGateway
from PreProcessing import parsers

# edit this when changing dirs
LangPaths = os.path.realpath(
    "C:/users/rihanna/Documents/Pol/ThesisIt/SumMe/Summarizer/langdetector/profiles/"
)
tltagger = nltk.data.load("taggers/filipino_aubt.pickle")  # Filipino POS tagger

tlChunker = nltk.data.load("chunkers/filipino_ub.pickle")  # Filipino chunker
enChunker = nltk.data.load("chunkers/conll2000_ub.pickle")  # English chunker

punkt_param = PunktParameters()  # container for tokenizer parameters
punkt_param.abbrev_types = set(['gng', 'mr', 'mrs', 'dr', 'rep'])  # additional abbreviations accepted by the tokenizer

sentence_splitter = PunktSentenceTokenizer(punkt_param)
tokenized = ""
gateway = JavaGateway()
detector = gateway.entry_point
detector.init(LangPaths)


def LangDetect(str):
    return detector.detect(str)


def tokenizer(str):
Code Example #10
def get_pdf_objects(filename, table_detect=True):  # pylint: disable=too-many-locals
    """extract body, table, table images from pdf
    """
    body, tables = [], []

    pages = fitz.open(filename)
    page_images, page_image_data = pdf_to_image(filename)

    prev_caption = None
    for i, page in enumerate(pages):
        ratio = page_images[i].shape[0] / page.rect[3]

        page_dict = get_pdf_page_dict(page, ratio)

        pred_table_boxes = find_tables(
            page_image_data[i]) if table_detect else []
        page_tables = table_post_process(page_dict, pred_table_boxes,
                                         prev_caption)
        prev_caption = page_tables[-1]['caption'] if page_tables else None

        # separate body blocks and table blocks
        table_blocks = [[] for _ in page_tables]

        for block in page_dict['blocks']:
            if block['type'] == 1:
                continue
            for j, table in enumerate(page_tables):
                if (not table['continued'] and overlap_ratio(
                        block['bbox'], table['caption']['bbox']) > 0.5):
                    break
                elif overlap_ratio(block['bbox'], table['bbox']) > 0.5:
                    table_blocks[j].append(block)
                    break
            else:
                body += get_lines(block)

        # construct table
        for j, (blocks, table) in enumerate(zip(table_blocks, page_tables)):
            table['cells'] = construct_table(blocks)

        # crop table images
        for table in page_tables:
            x1, y1, x2, y2 = table['bbox']
            image = page_images[i][y1:y2, x1:x2, :]
            if image.size == 0:
                continue
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            img_data = cv2.imencode(
                '.jpg', image,
                [int(cv2.IMWRITE_JPEG_QUALITY), 75])[1].tostring()
            table['image'] = img_data

        tables += page_tables

    # sentence tokenize body text
    body = ' '.join(map(clean_text, body))
    punkt_param = PunktParameters()
    punkt_param.abbrev_types = set(['fig'])
    body = list(PunktSentenceTokenizer(punkt_param).tokenize(body))
    body = split_sents(body)

    return body, tables
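
overlap_ratio is a project-specific helper that is not shown here. Assigning blocks to tables only needs some intersection-over-area measure, so a hypothetical stand-in could look like the following (an assumption about the real helper, which takes (x1, y1, x2, y2) boxes as in the bbox unpacking above):

def overlap_ratio(bbox_a, bbox_b):
    # hypothetical: intersection area divided by the area of the first box
    ax1, ay1, ax2, ay2 = bbox_a
    bx1, by1, bx2, by2 = bbox_b
    iw = max(0, min(ax2, bx2) - max(ax1, bx1))
    ih = max(0, min(ay2, by2) - max(ay1, by1))
    area_a = max(1e-9, (ax2 - ax1) * (ay2 - ay1))
    return (iw * ih) / area_a
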
Code Example #11
 def __init__(self):
     punkt_param = PunktParameters()
     punkt_param.abbrev_types = set(['e.g', 'ie', 'i.e', 'eg'])
     super().__init__(punkt_param)
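
This subclassing works because PunktSentenceTokenizer's first constructor argument can be a ready-made PunktParameters object instead of training text, which is the behaviour every example on this page relies on. A minimal stand-alone version of the same pattern (class name and sample text invented):

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

class AbbrevTokenizer(PunktSentenceTokenizer):
    def __init__(self):
        punkt_param = PunktParameters()
        punkt_param.abbrev_types = set(['e.g', 'i.e'])
        super().__init__(punkt_param)

print(AbbrevTokenizer().tokenize("Use stop words, e.g. articles. Then count."))
# expected: ['Use stop words, e.g. articles.', 'Then count.']
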
Code Example #12
def sent_pos(in_dir):
    """ Positions of citation markers in sentences, relatve to where in doc
    """

    punkt_param = PunktParameters()
    abbreviation = ['al', 'fig', 'e.g', 'i.e', 'eq', 'cf', 'ref', 'refs']
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)

    with open('hedge_words') as f:
        hedge_words = [l.strip() for l in f.readlines()]

    x = []
    y = []
    file_names = os.listdir(in_dir)
    buckets = []
    for foo in range(10):
        buckets.append([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    for file_idx, fn in enumerate(file_names):
        if file_idx % 100 == 0:
            print('{}/{}'.format(file_idx, len(file_names)))
        path = os.path.join(in_dir, fn)
        aid, ext = os.path.splitext(fn)
        if ext != '.txt' or aid == 'log':
            continue
        with open(path) as f:
            text = f.read()
        text = re.sub(E_G_PATT, 'e.g.', text)
        # annot_fn = '{}_annot.json'.format(aid)
        # annot_path = os.path.join(in_dir, annot_fn)
        # if not os.path.isfile(annot_path):
        #     continue
        # with open(annot_path) as f:
        #     annots = json.load(f)

        marker = ' \u241F '
        doc_len = len(text)
        # ↓ word wise
        for sent_idx, sent_edx in tokenizer.span_tokenize(text):
            sentence_orig = text[sent_idx:sent_edx]
            sentence = re.sub(CITE_MULTI_PATT, marker, sentence_orig)
            sentence = re.sub(QUOTE_PATT, ' {}.'.format(marker), sentence)
            # determine contained annotations
            # annotated_words = []
            # for annot in annots:
            #     start = annot[0]
            #     end = annot[1]
            #     dbp_id = annot[2]
            #     annot_len = end - start
            #     in_sent_idx = start - sent_idx
            #     if start >= sent_idx and end <= sent_edx:
            #         disp = sentence_orig[in_sent_idx:in_sent_idx+annot_len]
            #         annotated_words.append(disp)
            if marker in sentence:
                doc_pos = 1 - (sent_idx / doc_len)
                buck_y_idx = math.floor(doc_pos * 10)
                if buck_y_idx == 10:
                    buck_y_idx = 9
                words = pos_tag(sentence.split())
                words = [w for w in words if re.search(r'[\w|\u241F]', w[0])]
                sent_len = len(words)
                sent_tags_str = ' '.join([tup[1] for tup in words])
                indices = [
                    i for i, tup in enumerate(words)
                    if tup[0] == marker.strip()
                ]
                # if 'JJS' not in sent_tags_str:
                #     continue
                for word_idx in indices:
                    word = words[word_idx][0]
                    # if word == marker.strip() and \
                    #     words[word_idx-1][1] == 'IN':

                    # if word == marker.strip() and \
                    #     ((word_idx > 0 and \
                    #       'FORMULA' not in words[word_idx-1][0] and \
                    #       words[word_idx-1][1] in ['NNP', 'NNPS']) or \
                    #      (word_idx > 1 and \
                    #       words[word_idx-1][1] in ['NN', 'NNS'] and \
                    #       'FORMULA' not in words[word_idx-2][0] and \
                    #       words[word_idx-2][1] in ['NNP', 'NNPS'])):

                    # if word == marker.strip() and \
                    #     (word_idx > 0 and \
                    #      words[word_idx-1][0] in annotated_words and \
                    #      words[word_idx-1][1] in ['NNP', 'NNPS']):

                    # if word == marker.strip() and \
                    #     word_idx+1 < len(words) and \
                    #     'VB' in words[word_idx+1][1]:

                    if word == marker.strip():
                        # print(words)
                        # print('doc pos:  {}'.format((sent_idx/doc_len)))
                        # print('sent pos: {}/{}'.format((word_idx+1),sent_len))
                        # input()
                        sent_pos = (word_idx + 1) / sent_len
                        y.append(doc_pos)
                        x.append(sent_pos)
                        buck_x_idx = math.floor(sent_pos * 10)
                        if buck_x_idx == 10:
                            buck_x_idx = 9
                        buckets[buck_y_idx][buck_x_idx] += 1
        # if file_idx > 1000:
        #     break

        # # ↓ character wise
        # for sent_idx, sentence in enumerate(sentences):
        #     # has_hw = False
        #     # for hw in hedge_words:
        #     #     if hw in sentence:
        #     #         has_hw = True
        #     #         break
        #     # if not has_hw:
        #     #     continue
        #     sent_len = len(sentence)
        #     doc_pos = 1 - (sent_idx/doc_len)
        #     buck_y_idx = math.floor(doc_pos*10)
        #     if buck_y_idx == 10:
        #         buck_y_idx = 9
        #     for cit_mark in re.finditer(marker, sentence):
        #         cm_idx = cit_mark.end()
        #         sent_pos = cm_idx/sent_len
        #         y.append(doc_pos)
        #         x.append(sent_pos)
        #         buck_x_idx = math.floor(sent_pos*10)
        #         if buck_x_idx == 10:
        #             buck_x_idx = 9
        #         buckets[buck_y_idx][buck_x_idx] += 1

    print('normalized row distributions:')
    for line in buckets:
        print(' '.join(['{:.2f}'.format(x / sum(line)) for x in line]))

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    # plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()

    plt.clf()

    plt.xlabel('citation marker position in sentence')
    plt.ylabel('sentence position in document')

    heatmap, xedges, yedges = np.histogram2d(x, y, bins=(50))
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    # plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
    plt.imshow(heatmap.T, extent=extent, origin='lower')
    plt.colorbar()
    plt.show()
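
The plotting at the end is plain numpy/matplotlib. A self-contained sketch of the same histogram2d-to-imshow step with synthetic positions (random data, only to show the mechanics):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

x = np.random.beta(5, 2, 10000)   # synthetic "position in sentence" values
y = np.random.rand(10000)         # synthetic "position in document" values

heatmap, xedges, yedges = np.histogram2d(x, y, bins=50)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.imshow(heatmap.T, extent=extent, origin='lower', norm=LogNorm())
plt.colorbar()
plt.show()
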