def train_tokenizer():
    trainer = punkt.PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True
    print 'Training the tokenizer on SemEval'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        try:
            with open(semeval_file, 'r') as f:
                st = []
                for line in f:
                    st += [line.strip()]
                text = read_visit_sem(st)
                trainer.train(text, finalize=False)
        except IOError:
            pass
    trainer2 = copy.deepcopy(trainer)
    trainer2.finalize_training()
    tokenizer = punkt.PunktSentenceTokenizer(trainer2.get_params())
    out = open("tokenizer.pk", "wb")
    pickle.dump(tokenizer, out, -1)
    out.close()
    tokenizer = None
    trainer2 = None
    print 'Wrote tokenizer.'
    print 'Training the tokenizer on MIMIC'
    for notes_file in subset(notes_files, 15):  # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        trainer.train(text, finalize=False)
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            continue
    trainer2 = copy.deepcopy(trainer)
    trainer2.finalize_training()
    tokenizer = punkt.PunktSentenceTokenizer(trainer2.get_params())
    out = open("tokenizer.pk", "wb")
    pickle.dump(tokenizer, out, -1)
    out.close()
    print 'Wrote tokenizer.'
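# Round-trip sketch (assumed, not from the original source): restore the
# tokenizer that train_tokenizer() pickled above and use it directly.
import pickle

with open("tokenizer.pk", "rb") as f:
    tokenizer = pickle.load(f)

# split a clinical-style note into sentences
print(tokenizer.tokenize("Pt given 5 mg i.v. push. Tolerated well."))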
def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None):
    tokenizer = punkt.PunktSentenceTokenizer()
    if lang_vars is not None:
        tokenizer._lang_vars = lang_vars
    assert len(tokenizer.tokenize(input_text)) == n_sents
    assert len(list(tokenizer.debug_decisions(input_text))) == n_splits
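# Quick illustration (not part of the test helper above): debug_decisions()
# yields one record per candidate sentence break, explaining why Punkt did or
# did not split there.
from nltk.tokenize import punkt

tok = punkt.PunktSentenceTokenizer()
for decision in tok.debug_decisions("Mr. Smith arrived. He sat down."):
    print(decision)  # a dict describing the break decision and its reason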
def getSentiment(text, language="en"):
    ct = pkt.PunktSentenceTokenizer(lang_vars=CustomLanguageVars())
    sentences = ct.tokenize(text)
    # sentences = [sentence.strip() for sentence in sentences if sentence != ""]
    documents = []
    for i, sentence in enumerate(sentences):
        documents.append({"language": language, "id": str(i), "text": sentence})
    r = requests.post(API_URL,
                      json={"documents": documents},
                      headers={"Ocp-Apim-Subscription-Key": API_KEY,
                               "content-type": "application/json"})
    if r.status_code != 200:
        print("Something went wrong: %s" % r.text)
        return [(r.text, 0)]
    results = json.loads(r.text)
    scores = [None] * len(sentences)
    for tup in results["documents"]:
        scores[int(tup["id"])] = tup["score"]
    return zip(sentences, scores)
def clean_and_tag(row, text_col, csv_writer):
    """
    Clean given text and write each sentence to CSV
    """
    # set up sentence splitter with custom parameters
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', 'inc ', '.tm', 'tm', 'no', 'i.v', 'drs', 'u.s'}
    # the tokenizer has to be unpickled, so better to do it once here than every time it is used
    sentence_splitter = punkt.PunktSentenceTokenizer(punkt_params)

    # clean up html tags
    plaintext = nltk.clean_html(row[text_col])

    # TODO coreference resolution to find more relevant sentences
    sentences = sentence_splitter.tokenize(plaintext)

    # maybe unnecessary defensiveness...
    if len(sentences) > 0:
        for s in sentences:
            # remove punctuation, still want to add original sentence to CSV though
            #no_punct = re.findall(r'[\w\$\xc2()-]+', s)
            #no_punct = ' '.join(no_punct)
            tokens = nltk.word_tokenize(s)
            tags = nltk.pos_tag(tokens)
            # TODO parse tree info, something to do with stemming?
            # write row to file for each sentence
            row.append(tags)
            csv_writer.writerow(row)
def __init__(self):
    """Constructor."""
    super(EnglishPunktTokenizer, self).__init__()
    self._sentence_tokenizer = punkt.PunktSentenceTokenizer()
    # note: PunktWordTokenizer exists only in older NLTK releases (removed in 3.0)
    self._word_tokenizer = punkt.PunktWordTokenizer()
def test_punkt_tokenize_no_custom_lang_vars(self):
    obj = punkt.PunktSentenceTokenizer()

    # We expect this text NOT to be split, as the Bengali full stop '।' is
    # not included in the default language vars' sentence-ending characters.
    sentences = u"উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
    expected = [sentences]  # the whole text comes back as one "sentence"

    assert obj.tokenize(sentences) == expected
def getSentencesWithWord(allNodes, noun):
    # Sentence fragmenter trained on real_estate (arbitrarily). Pass the file's
    # contents as train_text at construction time; in NLTK, a post-hoc call to
    # train() returns the parameters instead of storing them, and a bare path
    # string would be treated as the training text itself.
    with open("data/real_estate.txt") as f:
        trainer = punkt.PunktSentenceTokenizer(train_text=f.read())
    sentences = []
    for node in allNodes:
        name = node.get("name")
        tokens = trainer.tokenize(node.get("text"))
        for token in tokens:
            if noun in token:
                sentences.append(token)
    return sentences
def test_punkt_tokenize_custom_lang_vars(self):
    # Create LangVars including a full stop end character as used in Bengali
    class BengaliLanguageVars(punkt.PunktLanguageVars):
        sent_end_chars = ('.', '?', '!', '\u0964')

    obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars())

    # We now expect the text to be split up into the individual sentences
    sentences = u"উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।"
    expected = [
        "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।",
        "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।",
        "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।",
    ]

    assert obj.tokenize(sentences) == expected
def set_up_tokenizer():
    """
    Set up sentence splitter with custom parameters and return to caller
    """
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', '.tm', 'tm', 'no', 'i.v', 'dr', 'drs', 'u.s', 'u.k',
                                 'ltd', 'vs', 'vol', 'corp', 'jan', 'feb', 'mar', 'apr',
                                 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec',
                                 'pm', 'p.m', 'am', 'a.m', 'mr', 'mrs', 'ms', 'i.e', 'e.g',
                                 # above is from reuters, below for eu-adr specifically
                                 'spp'}
    return punkt.PunktSentenceTokenizer(punkt_params)
def set_up_tokenizer():
    """
    Set up sentence splitter with custom parameters and return to caller
    """
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', '.tm', 'tm', 'no', 'i.v', 'dr', 'drs', 'u.s', 'u.k',
                                 'ltd', 'vs', 'vol', 'corp', 'jan', 'feb', 'mar', 'apr',
                                 'jun', 'jul', 'aug', 'sep', 'sept', 'oct', 'nov', 'dec',
                                 'pm', 'p.m', 'am', 'a.m', 'mr', 'mrs', 'ms', 'i.e'}
    # the tokenizer has to be unpickled, so better to do it once here than every time it is used
    return punkt.PunktSentenceTokenizer(punkt_params)
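# Illustrative usage sketch (not from the original source): with the abbreviation
# list above, tokens such as 'dr', 'p.m' and 'i.e' should no longer end sentences.
splitter = set_up_tokenizer()
text = "Dr. Smith saw the patient at 3 p.m. yesterday. Symptoms improved, i.e. less nausea."
for sent in splitter.tokenize(text):
    print(sent)
# expected: two sentences, with no breaks after 'Dr.', 'p.m.' or 'i.e.'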
def read_all(self):
    """A wrapper to read all abstracts and annotations"""
    self.train_txt = self.read_abstracts(self._train_text_path)
    self.dev_txt = self.read_abstracts(self._dev_text_path)
    self.eval_txt = self.read_abstracts(self._eval_text_path)
    print(f"Finished reading abstracts.\n"
          f"# of sentences read: Train: {len(self.train_txt)}, "
          f"Dev: {len(self.dev_txt)}, Eval: {len(self.eval_txt)}")

    self.train_anno = self.read_annotations(self._train_label_path)
    self.dev_anno = self.read_annotations(self._dev_label_path)
    self.eval_anno = self.read_annotations(self._eval_label_path)
    print("Finished reading annotations")

    self.punkt_tokenizer = punkt.PunktSentenceTokenizer(self.all_texts)
def __init__(self, text):
    '''
    Initialise the NLTK PunktSentenceTokenizer with our custom language
    variables for sentence splitting. Initialise a RegEx pattern for use
    in the NLTK RegexpTokenizer.

    :param text: string of raw continuous text
    '''
    self.text = text
    self.custom_tknzr = pkt.PunktSentenceTokenizer(
        lang_vars=CustomLanguageVars.CustomLanguageVars())
    self.pattern = r'''(?x)
        \b[a-zA-Z0-9._%+-]+@\s*?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b  # email addresses
        |\s+                     # any consecutive whitespace characters
        |\b(?:[a-zA-Z]\.)+\b     # abbreviations, e.g. U.S.A.
        |\d+(?:\.\d+)?%?         # numbers, incl. percentages
        |\w+(?:[-']\w+)*         # words with optional internal hyphen/apostrophe
        |[.,;:!?"'()\[\]]        # specific punctuation characters
        |\S+                     # any consecutive non-whitespace characters
        '''
    self.regex_tknzr = ret.RegexpTokenizer(self.pattern)
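# Sanity check for the pattern above (illustrative, not from the original
# source; it feeds the same verbose regex straight into NLTK's RegexpTokenizer;
# note that whitespace runs come back as tokens too, by design).
from nltk.tokenize import RegexpTokenizer

pattern = r'''(?x)
    \b[a-zA-Z0-9._%+-]+@\s*?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b  # email addresses
    |\s+                     # any consecutive whitespace characters
    |\b(?:[a-zA-Z]\.)+\b     # abbreviations, e.g. U.S.A.
    |\d+(?:\.\d+)?%?         # numbers, incl. percentages
    |\w+(?:[-']\w+)*         # words with optional internal hyphen/apostrophe
    |[.,;:!?"'()\[\]]        # specific punctuation characters
    |\S+                     # any consecutive non-whitespace characters
    '''
print(RegexpTokenizer(pattern).tokenize("Contact [email protected]: sales rose 4.2% in the U.S.A."))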
def punkt(self, text):
    """
    Sentence Segmentation using the Punkt Tokenizer

    Parameters
    ----------
    text : str
        A string (a bunch of sentences)

    Returns
    -------
    list
        A list of strings where each string is a single sentence
    """
    sent_splitter = punkt.PunktSentenceTokenizer()
    segmented_text = sent_splitter.tokenize(text)
    return segmented_text
def getContext(allNodes, links):
    """ Gets the sentence before and the sentence after each citation """
    # Sentence fragmenter trained on real_estate (arbitrarily). Pass the file's
    # contents as train_text at construction time; in NLTK, a post-hoc call to
    # train() returns the parameters instead of storing them, and a bare path
    # string would be treated as the training text itself.
    with open("data/real_estate.txt") as f:
        trainer = punkt.PunktSentenceTokenizer(train_text=f.read())
    for srcIndex, node in enumerate(allNodes):
        name = node.get("name")
        tokens = trainer.tokenize(node.get("text"))
        for link in links:
            if link.get("source") == srcIndex:
                target = allNodes[link.get("target")].get("name")
                for sentIndex, sentence in enumerate(tokens):
                    if target in sentence:
                        prevSent = ''
                        i = 1
                        # If the previous sentence was too short, add more
                        while len(prevSent.split(' ')) < 10:
                            if sentIndex - i < 0:  # don't wrap around to the end of the list
                                break
                            prevSent = tokens[sentIndex - i] + prevSent
                            i += 1
                        print 'PREV SENTENCE: ' + prevSent
                        print 'CURRENT SENTENCE: ' + sentence
                        nextSent = ''
                        i = 1  # reset the offset before walking forward
                        # Same with the next sentence
                        while len(nextSent.split(' ')) < 10:
                            try:
                                nextSent += tokens[sentIndex + i]
                            except IndexError:
                                break
                            i += 1
                        print 'NEXT SENTENCE: ' + nextSent
def __init__(self, name="", features=None):
    """ Initializes a feature set. """
    # Load various libraries / dictionaries if they haven't been
    if FeatureSet.pronouns is None:
        FeatureSet.pronouns = loadDictionary(PRONOUN_FILENAME)
    if FeatureSet.words is None:
        FeatureSet.words = loadDictionary(DICT_FILENAME)
    if FeatureSet.stop_words is None:
        FeatureSet.stop_words = loadDictionary(STOP_FILENAME)
    if FeatureSet.st is None:
        # FeatureSet.st = punkt.PunktSentenceTokenizer(gutenberg.raw(gutenberg.files()))
        FeatureSet.st = punkt.PunktSentenceTokenizer()
    if FeatureSet.wt is None:
        FeatureSet.wt = punkt.PunktWordTokenizer()

    # predefined set of features?
    if features is None:
        self.features = {}
    else:
        self.features = features

    # article name
    self.name = name
#!/usr/bin/env python3
# punktgen.py
import sys, pickle
from nltk.tokenize import punkt

if len(sys.argv) < 3:
    print("Usage: %s infile outfile" % sys.argv[0])
    sys.exit()

inf = open(sys.argv[1])
outf = open(sys.argv[2], 'wb')
# Train at construction time: in NLTK, PunktSentenceTokenizer.train() returns
# the learned parameters rather than storing them on the instance, so calling
# it after construction would leave the pickled tokenizer untrained.
tk = punkt.PunktSentenceTokenizer(train_text=inf.read())
pickle.dump(tk, outf)
outf.close()
inf.close()
print(sys.argv[2] + " saved.")
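#!/usr/bin/env python3
# Companion sketch (assumed, not from the original source): load the pickle
# that punktgen.py wrote and split some text with it.
import pickle, sys

with open(sys.argv[1], 'rb') as f:
    tk = pickle.load(f)

print(tk.tokenize("The model was trained above. Now it splits sentences."))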
#! /usr/bin/env python
# Author: Kapil Thadani ([email protected])

from __future__ import division, with_statement

from nltk.tokenize import punkt

punkt_splitter = punkt.PunktSentenceTokenizer()

# Suffixes observed to frequently cause incorrect splits
# (derived from observations in parsing WikiNews).
bad_suffixes = ['Mr.',
                'Ms.',
                'Mrs.',
                'Dr.',
                'Lt.',
                'Sgt.',
                'Maj.',
                # 'Col.',
                # 'Gen.',
                # 'Adm.',
                'Sen.',
                'Rep.',
                'U.S.',
                'U.N.',
                'U.K.',
                'E.U.',
                'Jan.',
                'Feb.',
                'Mar.',
                'Apr.',
                ]
def build():
    # this custom tokenizer doesn't handle abbrevs as well, need to add them:
    my_punkt_param = PunktParameters()
    my_punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'd.c', 'a.d', 'b.c',
        'r.s.v.p', 'p.s', 'a.s.a.p', 'e.t.a', 'd.i.y', 'r.i.p', 'e.g'
    ])
    # passing the PunktParameters object as train_text works because NLTK's
    # PunktSentenceTokenizer.train() returns non-string arguments unchanged
    my_sent_tokenizer = pkt.PunktSentenceTokenizer(lang_vars=MyLanguageVars(),
                                                   train_text=my_punkt_param)
    reader = MyCorpusReader("flask_query_api/api/query/files/", r".*\.txt",
                            para_block_reader=my_read_blankline_block,
                            sent_tokenizer=my_sent_tokenizer)
    # the "\n" (or, "%0A") passed from url
    # Try to add line into dictionary
    lineno = 1
    occurrences = []
    keys = []
    values = []
    for para in reader.paras():
        if not para:
            lineno += 1
        else:
            # keep track of the column position of the last sentence in the last paragraph
            col_pos = 0
            for sent in para:
                count_line = 0
                sent_no_linebreak = re.sub(r'\n|\r', ' ', sent)
                lines = sent.split('\n')
                for line in lines:
                    if count_line != 0:
                        lineno += 1
                        col_pos = len(line)
                        for length in range(1, len(line) + 1):
                            for start_pos in range(0, len(line) - length + 1):
                                key = line[start_pos:start_pos + length]
                                value = (lineno, start_pos + 1,
                                         start_pos + length + 1,
                                         bytearray(sent_no_linebreak.strip(), 'utf-8'))
                                keys.append(key)
                                values.append(value)  # (line, start, end, in_sentence)
                    else:
                        for length in range(1, len(line) + 1):
                            for start_pos in range(0, len(line) - length + 1):
                                key = line[start_pos:start_pos + length]
                                value = (lineno, start_pos + 1 + col_pos,
                                         start_pos + length + 1 + col_pos,
                                         bytearray(sent_no_linebreak.strip(), 'utf-8'))
                                keys.append(key)
                                values.append(value)  # (line, start, end, in_sentence)
                        col_pos += len(line)
                    count_line += 1
            lineno += 2
    value_format = ">LLL512s"
    data = zip(keys, values)
    dictionary = marisa_trie.RecordTrie(value_format, data)
    # dictionary = dawg.RecordDAWG(value_format, data)
    return dictionary
nltk.download('punkt')


class CustomLanguageVars(pkt.PunktLanguageVars):

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        \s*                          # <-- THIS is what I changed
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            (?P<next_tok>\S+)        # <-- Normally you would have \s+ here
        ))"""


sentence_tokenizer = pkt.PunktSentenceTokenizer(lang_vars=CustomLanguageVars())


@app.route('/analyze', methods=['POST'])
def analyze():
    review_text = request.form.get('review_text')
    if review_text and len(review_text) > 10000:
        return jsonify({
            "error": "Only reviews of less than 10K characters are supported."
        })
    elif review_text:
        review_lines = [
            process_line(l) for l in sentence_tokenizer.tokenize(review_text)
        ]
        # use first model for short reviews
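# Illustration of the tweak above (an assumption based on the comments in the
# pattern): moving the '\s*' before the lookahead makes each sentence keep its
# trailing whitespace, so joining the pieces reconstructs the original text.
tok = pkt.PunktSentenceTokenizer(lang_vars=CustomLanguageVars())
sents = tok.tokenize("First sentence.  Second one.\nThird.")
print(sents)           # e.g. ['First sentence.  ', 'Second one.\n', 'Third.']
print("".join(sents))  # concatenation should give back the original string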
    # tail of a word-frequency helper (the enclosing def is not shown here)
    sd = statistics.stdev(count_list)
    max_word_list = []
    for key, value in freq_tup:
        if value >= (mean_count + 1.5 * sd):
            max_word_list.append(key)
    print(max_word_list)
    return max_word_list
    print(mean_count)  # note: unreachable, follows the return above
    print(sd)


import nltk.tokenize.punkt as punk

punk_cl = punk.PunktSentenceTokenizer()
sent_break = punk_cl.sentences_from_text(clean_text_read)
# clean each sentence in place; locating sentences with list.index() while
# mutating the list misbehaves once duplicate sentences appear
for ind, sentence in enumerate(sent_break):
    sent_break[ind] = clean_word(sentence)

word_read = open('combined_biology_wordlist.txt', 'r')
bio_word = word_read.readlines()
clean_bio_word = []
for i in bio_word:
    new = i.strip()
    clean_bio_word.append(new)
def __init__(self):
    """Initialize the bad prefixes and suffixes surrounding splits."""
    self.punkt_splitter = punkt.PunktSentenceTokenizer()
    self.whitespace = set(WHITESPACE)
def clean_and_tag():
    """
    Create new CSV containing all relevant sentences
    """
    # set filepath to input
    basepath = os.path.dirname(__file__)
    file_in = 'data/reuters/press_releases/PR_drug_company_500.csv'
    file_in = os.path.abspath(os.path.join(basepath, '..', '..', file_in))
    file_out = os.path.abspath(os.path.join(basepath, '..', 'reuters/sentences_POS.csv'))

    # set up sentence splitter with custom parameters
    punkt_params = punkt.PunktParameters()
    # sentences are not split after the given abbreviations; {} creates a set literal
    punkt_params.abbrev_types = {'inc', 'inc ', '.tm', 'tm', 'no', 'i.v', 'drs', 'u.s'}
    # the tokenizer has to be unpickled, so better to do it once here than every time it is used
    sentence_splitter = punkt.PunktSentenceTokenizer(punkt_params)

    with open(file_in, 'rb') as csv_in:
        with open(file_out, 'wb') as csv_out:
            # TODO use dictionary reader to avoid using magic numbers for columns
            csv_reader = csv.reader(csv_in, delimiter=',')
            csv_writer = csv.writer(csv_out, delimiter=',')

            # write column headers on first row
            row = csv_reader.next()
            row.append('POS TAGS')
            csv_writer.writerow(row)

            for row in csv_reader:
                # use stdout to avoid spaces and newlines
                sys.stdout.write('.')
                # need to flush the buffer to display immediately
                sys.stdout.flush()

                # clean up html tags
                plaintext = nltk.clean_html(row[1])
                drug = row[3]
                company = row[5]
                src = row[0]

                # only consider texts containing both the drug and company
                if drug in plaintext and company in plaintext:
                    sentences = sentence_splitter.tokenize(plaintext)
                    # filter for only sentences mentioning drug, company or both
                    # TODO coreference resolution to find more relevant sentences
                    sentences = [s for s in sentences if drug in s or company in s]

                    if len(sentences) > 0:
                        for s in sentences:
                            # remove punctuation, still want to add original sentence to CSV though
                            no_punct = re.findall(r'[\w\$\xc2()-]+', s)
                            no_punct = ' '.join(no_punct)
                            tokens = nltk.word_tokenize(no_punct)
                            tags = nltk.pos_tag(tokens)
                            # TODO parse tree info, something to do with stemming?
                            # write row to file for each sentence
                            row.append(tags)
                            csv_writer.writerow([src, s, row[2], drug, row[4], company, tags])
def text_search_sequential(search_term):
    # this custom tokenizer doesn't handle abbrevs as well, need to add them:
    my_punkt_param = PunktParameters()
    my_punkt_param.abbrev_types = set([
        'dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'd.c', 'a.d', 'b.c',
        'r.s.v.p', 'p.s', 'a.s.a.p', 'e.t.a', 'd.i.y', 'r.i.p', 'e.g'
    ])
    my_sent_tokenizer = pkt.PunktSentenceTokenizer(lang_vars=MyLanguageVars(),
                                                   train_text=my_punkt_param)
    reader = MyCorpusReader("flask_query_api/api/query/files/", r".*\.txt",
                            para_block_reader=my_read_blankline_block,
                            sent_tokenizer=my_sent_tokenizer)
    # the "\n" (or, "%0A") passed from url
    search_two_lines = re.search(r'\n', search_term)
    lineno = 1
    occurrences = []
    for para in reader.paras():
        if not para:
            lineno += 1
        else:
            # keep track of the column position of the last sentence in the last paragraph
            col_pos = 0
            for sent in para:
                count_line = 0
                sent_no_linebreak = re.sub(r'\n|\r', ' ', sent)
                results_sent_list = []
                for m in re.finditer(re.escape(search_term), sent):
                    results_sent_list.append({
                        'start': m.start() + 1,
                        'end': m.end() + 1,
                        'in_sentence': sent_no_linebreak.strip()
                    })
                result = None
                if results_sent_list:
                    result = results_sent_list.pop(0)
                sent_pos = 0
                lines = sent.split('\n')
                for line in lines:
                    if count_line != 0:
                        lineno += 1
                        col_pos = len(line)
                        # use while because there may be multiple matches per line
                        while result and (sent_pos + len(line) + count_line > result['start']):
                            result['line'] = lineno
                            result['start'] -= (sent_pos + count_line)
                            result['end'] -= (sent_pos + count_line)
                            if search_two_lines:
                                result['end'] -= (len(line) + 1)
                            occurrences.append(result)
                            if results_sent_list:
                                result = results_sent_list.pop(0)
                            else:
                                result = None
                        sent_pos += len(line)
                    else:
                        # search found on current line
                        while result and (len(line) > result['start']):
                            result['line'] = lineno
                            result['start'] += col_pos
                            result['end'] += col_pos
                            if search_two_lines:
                                result['end'] -= (len(line) + 1)
                            occurrences.append(result)
                            if results_sent_list:
                                result = results_sent_list.pop(0)
                            else:
                                result = None
                        col_pos += len(line)
                        sent_pos += len(line)
                    count_line += 1
            lineno += 2
    response = {
        "query_text": search_term,
        "number_of_occurrences": len(occurrences),
        "occurences": occurrences
    }
    return response
def __init__(self, dataset: List[str]):
    self.text = "\n".join(dataset)
    self.tokenizer = punkt.PunktSentenceTokenizer(train_text=self.text)
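# Hedged usage sketch: the enclosing class is not shown in the source, so
# CorpusSentenceSplitter below is a hypothetical stand-in built around the
# __init__ above.
from typing import List

from nltk.tokenize import punkt


class CorpusSentenceSplitter:
    def __init__(self, dataset: List[str]):
        self.text = "\n".join(dataset)
        # training at construction adapts Punkt's abbreviation and
        # collocation statistics to the corpus itself
        self.tokenizer = punkt.PunktSentenceTokenizer(train_text=self.text)


docs = ["Dr. Ada wrote this. It has two sentences.", "A second document."]
splitter = CorpusSentenceSplitter(docs)
print(splitter.tokenizer.tokenize(docs[0]))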