Ejemplo n.º 1
0
def split_sentences(segmented_files):
    """Split every segment of every segmented file into sentences.

    Args:
        segmented_files: Iterable of per-file records. Only index 2 of each
            record is read; it is iterated as that file's segments, and each
            segment is handed to the regex splitter (so it is expected to be
            a string).

    Returns:
        A list with one entry per input file. Each entry is a list with one
        entry per segment, and each of those is the list of non-empty
        sentence strings produced from that segment.
    """
    gazetteers = load_gazetteers()

    # A boundary is one whitespace character that directly follows one of the
    # characters in [.|!|?], unless one of the negative lookbehinds vetoes it:
    # whitespace + capital + '.', two "capital + '.'" pairs in a row,
    # '.' + whitespace + '.', or two consecutive dots.
    # The gazetteer-supplied prefix is presumably a run of negative
    # lookbehinds generated from the 'abbreviations' gazetteer -- verify
    # against load_gazetteers().
    # NOTE(review): inside a character class '|' is a literal, so "| " also
    # counts as a boundary -- confirm whether that is intended.
    # The pattern does not depend on the segment, so it is built and compiled
    # once. The literal is a raw string so that '\s' reaches the regex engine
    # without relying on Python's deprecated invalid-escape fallback.
    boundary = re.compile(
        gazetteers["abbreviations_regex"]
        + r"(?<=[.|!|?])(?<!\s[A-Z][.])(?<![A-Z][.][A-Z][.])(?<![.]\s[.])(?<![.][.])[\s]"
    )

    sentences_by_file = []
    for segmented_file in segmented_files:
        file_sentences = []
        sentences_by_file.append(file_sentences)
        for segment in segmented_file[2]:
            sentences = boundary.split(segment)
            # The list is edited in place, so walk it by index and advance
            # only when the current slot is kept.
            k = 0
            while k < len(sentences):
                if sentences[k] == "":
                    # Drop empty pieces (e.g. from consecutive boundaries).
                    del sentences[k]
                    continue
                # If the following piece itself begins with a boundary match,
                # fold it (stripped) into the current sentence.
                # NOTE(review): the fixed part of the pattern opens with a
                # lookbehind, which may never succeed at position 0 --
                # confirm whether this merge can actually trigger.
                if k < len(sentences) - 1 and boundary.match(sentences[k + 1]):
                    sentences[k] = sentences[k] + sentences[k + 1].strip()
                    del sentences[k + 1]
                k += 1
            file_sentences.append(sentences)
    return sentences_by_file
Ejemplo n.º 2
0
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses>.
"""

import sys
import os
import re

import json

from cy_textsegmenter import *
from cy_sentencesplitter import *
from shared.load_gazetteers import *

# Gazetteers are loaded once, when this module is imported.
gazetteers = load_gazetteers()

# Contractions and prefixes are read from a JSON file in the "cy_gazetteers"
# directory that sits next to this file's own directory. The path is resolved
# relative to this source file, not the current working directory.
# The file is opened as UTF-8 explicitly: the default text encoding is
# platform-dependent and JSON content is not guaranteed to decode under it.
contractions_and_prefixes = {}
with open("{}/../cy_gazetteers/contractions_and_prefixes.json".format(
        os.path.dirname(os.path.abspath(__file__))),
        encoding="utf-8") as contractionsprefixes_json:
    contractions_and_prefixes = json.load(contractionsprefixes_json)


def remove_markup(tokens):
    """Remove "<anon>" / "</anon>" markup tags from tokens, in place.

    A leading "<anon>" and/or a trailing "</anon>" is stripped from each
    token; tokens carrying neither tag are left untouched.

    Args:
        tokens: Mutable sequence of token strings; modified in place.
    """
    for i, token in enumerate(tokens):
        stripped = token
        if stripped[:6] == "<anon>":
            stripped = stripped[6:]
        # Test the already-stripped value, so a token wrapped in both tags
        # loses both rather than having the first strip overwritten.
        if stripped[-7:] == "</anon>":
            stripped = stripped[:-7]
        if stripped != token:
            tokens[i] = stripped
Ejemplo n.º 3
0
     process(input_text=sys.stdin.read())
 else:
     if args[0] == "evaluate":
         arguments = parse_evaluation_arguments(args)
         #evaluate(arguments.gold, arguments.cytag, soft_evaluation=arguments.soft)
     else:
         if len(args) == 1 and os.path.isfile(
                 args[0]) != True and os.path.isdir(
                     args[0]) != True and args[0].startswith("-") != True:
             process(input_text=args[0])
         else:
             arguments = parse_processing_arguments(args)
             if arguments.lexicon and arguments.lexicon == "y":
                 load_lexicon()
             if arguments.gazetteer and arguments.gazetteer == "y":
                 load_gazetteers()
             if os.path.isdir(arguments.input[0]) and len(
                     arguments.input) == 1:
                 names = next(os.walk(arguments.input[0]))[2]
                 filepaths = []
                 for fn in names:
                     fp = os.path.join(arguments.input[0], fn)
                     filepaths.append(fp)
                 filenames = filepaths
             else:
                 filenames = arguments.input
             process(filenames,
                     output_name=arguments.name,
                     directory=arguments.dir,
                     component=arguments.component,
                     output_format=arguments.format)