Code example #1
             "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
             "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
             "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
             "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through",
             "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
             "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
             "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
             "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
stopwords = set(stopwords)
OLD_ENGLISH = {"thy": "your", "thou": "you", "Thy": "Your", "Thou": "You"}

# moses tokenizer
from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer, MosesDetruecaser
mtok = MosesTokenizer(lang='en')
mtr = MosesTruecaser("vocab/truecase-model.en")
md = MosesDetokenizer(lang="en")
mdtr = MosesDetruecaser()

# bpe tokenizer
from subword_nmt.apply_bpe import BPE, read_vocabulary
vocabulary = read_vocabulary(codecs.open("vocab/vocab.bpe35000.chr", encoding='utf-8'), 10)
bpe = BPE(codes=codecs.open("vocab/codes_file_chr_35000", encoding='utf-8'), merges=35000, vocab=vocabulary)

# load nmt models
import onmt.opts
from translator_for_demo import build_translator
from onmt.utils.parse import ArgumentParser


def _parse_opt(opt):
    prec_argv = sys.argv
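The excerpt above ends inside `_parse_opt`, but the preprocessing objects it builds (Moses tokenizer, truecaser and detokenizer plus the subword-nmt BPE segmenter) are normally chained in a fixed order. The sketch below is illustrative only and not part of the original demo; the function names, the pipeline order, and the default "@@ " BPE separator are assumptions.

def preprocess(sentence):
    tokenized = mtok.tokenize(sentence, return_str=True)   # Moses tokenization
    truecased = mtr.truecase(tokenized, return_str=True)   # apply the loaded truecase model
    return bpe.process_line(truecased)                     # BPE segmentation with "@@ " joins

def postprocess(translation):
    merged = translation.replace("@@ ", "")                # undo BPE segmentation
    detruecased = mdtr.detruecase(merged)                  # list of detruecased tokens
    return md.detokenize(detruecased)                      # Moses detokenization back to a plain string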
Code example #2
File: test_bleu_seq.py  Project: MunzT/NMTVis
	with open(tgt_file, 'r') as f:
		targets = f.readlines()

	assert len(sources) == len(targets)
	print('Loaded', len(sources), 'sentences')
	return sources, targets


print("Loading vocab...")
src_vocab, tgt_vocab = d.load_vocab(src_lang, tgt_lang)
d.SRC.vocab = src_vocab
d.TGT.vocab = tgt_vocab
src_pad_key = d.SRC.vocab.stoi[d.BLANK_WORD]
tgt_pad_key = d.TGT.vocab.stoi[d.BLANK_WORD]

mtok = MosesDetokenizer(lang=tgt_lang)

print("Loading data...")
sources, targets = load_data(test_year=TEST_YEAR)

print('Loading model ...')
model = Seq2SeqModel.load(src_lang=src_lang, tgt_lang=tgt_lang, epoch=20)


print('Starting test...')

i = 0
translations = []
references = []
with torch.no_grad():
	for src_text, tgt_text in zip(sources, targets):
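The excerpt stops inside the translation loop. As a purely illustrative follow-up (not taken from test_bleu_seq.py), hypotheses collected in `translations` and their references could be scored with sacrebleu once the loop has finished:

# Hypothetical scoring step; assumes both lists hold detokenized strings.
import sacrebleu

bleu = sacrebleu.corpus_bleu(translations, [references])
print("BLEU:", round(bleu.score, 2))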
Code example #3
File: generator.py  Project: yf1291/nlp4
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        self.in_transforms = []
        self.out_transforms = []

        if getattr(args, 'moses', False):
            tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
            detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
            self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
            self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
        elif getattr(args, 'nltk', False):
            from nltk.tokenize import word_tokenize
            self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

        if getattr(args, 'gpt2_bpe', False):
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
            vocab_bpe = src_bpe
            encoder = get_encoder(encoder_json, vocab_bpe)
            self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
            self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
            self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
        elif getattr(args, 'sentencepiece', False):
            import sentencepiece as spm
            sp = spm.SentencePieceProcessor()
            sp.Load(src_bpe)
            self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
        elif src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
            self.in_transforms.append(lambda s: bpe.process_line(s))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
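The constructor above only registers the `in_transforms` / `out_transforms` callables. A minimal sketch of how such transform chains are typically applied around generation; the helper below is an assumption, not fairseq API:

# Sketch only: run each registered transform in order (tokenize then encode on the
# way in; strip subwords then detokenize on the way out).
def apply_transforms(transforms, line):
    for transform in transforms:
        line = transform(line)
    return line

# source = apply_transforms(self.in_transforms, "Raw input sentence.")
# output = apply_transforms(self.out_transforms, decoded_hypothesis)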
Code example #4
File: base_inflect.py  Project: salesforce/bite
class BITETokenizer(object):
    inflection_tokens = [
        "[JJR]", "[JJS]", "[NNS]", "[NNPS]", "[RBR]", "[RBS]", "[VBD]",
        "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
    ]
    single_char_map = {
        "[JJR]": chr(9774),
        "[JJS]": chr(9775),
        "[NNS]": chr(9776),
        "[NNPS]": chr(9777),
        "[RBR]": chr(9778),
        "[RBS]": chr(9779),
        "[VBD]": chr(9780),
        "[VBG]": chr(9781),
        "[VBN]": chr(9782),
        "[VBP]": chr(9783),
        "[VBZ]": chr(9784)
    }
    reverse_single_char_map = {v: k for k, v in single_char_map.items()}
    lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
    have_inflections = {'NOUN', 'ADJ', 'VERB'}

    def __init__(self, pretokenizer='moses'):
        self.tagger = PerceptronTagger()
        self.pretok_type = pretokenizer
        if pretokenizer == 'bertpretokenizer':
            self.pretokenizer = BertPreTokenizer()
        elif pretokenizer == 'moses':
            self.pretokenizer = MosesTokenizer()
            self.detokenizer = MosesDetokenizer()
        elif pretokenizer == 'whitespace':
            pass
        else:
            raise ValueError(
                "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
            )

    def _pretokenize(self, sentence: str) -> List[str]:
        if self.pretok_type == 'bertpretokenizer':
            return [tup[0] for tup in self.pretokenizer.pre_tokenize(sentence)]
        elif self.pretok_type == 'whitespace':
            return sentence.split()
        else:
            return self.pretokenizer.tokenize(sentence)

    def tokenize(self,
                 sentence: Union[str, List[str]],
                 pretokenize: bool = True,
                 map_to_single_char: bool = False) -> List[str]:
        if pretokenize:
            pretokenized = self._pretokenize(sentence)
        else:
            # Allow users to pass in a list of tokens if using custom pretokenizers
            pretokenized = sentence
        ptb_pos_tagged = self.tagger.tag(pretokenized)
        universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                                for (token, tag) in ptb_pos_tagged]
        tokenized = []
        for i, (word, pos) in enumerate(ptb_pos_tagged):
            if (universal_pos_tagged[i][1] in self.have_inflections
                    and word not in (string.punctuation + '—')
                    and pos not in self.lemma_tags):
                lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
                if not lemma:
                    lemma = word
                tokenized.append(lemma)
                tokenized.append('[' + pos + ']')
            else:
                tokenized.append(word)
        if map_to_single_char:
            tokenized = [
                self.single_char_map[token]
                if token in self.inflection_tokens else token
                for token in tokenized
            ]
        return tokenized

    def detokenize(self,
                   tokens: List[str],
                   as_list: bool = False) -> Union[str, List[str]]:
        result = []
        for i, token in enumerate(tokens):
            # combine wordpiece tokens
            if token in self.reverse_single_char_map:
                token = self.reverse_single_char_map[token]
            if token in self.inflection_tokens:
                if i != 0:
                    inflected = getInflection(result[-1], tag=token[1:-1])
                    if inflected:
                        result[-1] = inflected[0]
            else:
                result.append(token)

        if as_list:
            # Allow users to detokenize using their own detokenizers
            return result
        if self.pretok_type == 'moses':
            return self.detokenizer.detokenize(result)
        return ' '.join(result)
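A hypothetical round trip with the class above; the exact token sequence depends on the POS tagger and the getLemma/getInflection helpers, so the commented output is only indicative:

bite = BITETokenizer(pretokenizer='moses')
pieces = bite.tokenize("The cats were sleeping.")
# roughly: ['The', 'cat', '[NNS]', 'be', '[VBD]', 'sleep', '[VBG]', '.']
restored = bite.detokenize(pieces)   # re-inflects the lemmas and detokenizes with Moses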
Code example #5
File: utils.py  Project: Natithan/DIRT
import torch
import jsondiff

from allennlp.common.checks import ConfigurationError
from allennlp.common.params import Params
from sacremoses import MosesDetokenizer

from .config import Params

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

SOS_TOK, EOS_TOK = "<SOS>", "<EOS>"

# Note: using the full 'detokenize()' method is not recommended, since it does
# a poor job of adding correct whitespace. Use unescape_xml() only.
_MOSES_DETOKENIZER = MosesDetokenizer()


def get_output_attribute(out, attribute_name, cuda_device, reduction="sum"):
    """
    This function handles processing/reduction of output for both
    DataParallel and non-DataParallel situations.
    For the case of multiple GPUs, this function will sum a given
    output attribute across the per-GPU replica batches.

    Parameters
    ---------------------
    :param out: Dictionary, output of model during forward pass,
    :param attribute_name: str,
    :param cuda_device: list or int
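The excerpt is cut off inside the docstring above. Separately, following the module comment that recommends `unescape_xml()` over the full `detokenize()`, a helper in that spirit might unescape Moses XML entities token by token (the helper name and example strings are made up):

def unescape_moses(tokens):
    # Undo Moses escaping without letting the detokenizer re-join whitespace.
    return [_MOSES_DETOKENIZER.unescape_xml(t) for t in tokens]

unescape_moses(["&quot;", "AT&amp;T", "&quot;"])  # -> ['"', 'AT&T', '"']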
Code example #6
class Tokenizer(object):
    def __init__(self,
                 vocab_file=None,
                 additional_tokens=None,
                 use_moses=None):
        self.special_tokens = [PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN]
        if use_moses is not None:
            self.enable_moses(lang=use_moses)
        if additional_tokens is not None:
            self.special_tokens += additional_tokens
        self.__word2idx = {}
        self.vocab_file = vocab_file
        if vocab_file is not None and os.path.isfile(vocab_file):
            self.load_vocab(vocab_file)

    def enable_moses(self, lang='en', tokenize=True, detokenize=True):
        if tokenize:
            self._moses_tok = MosesTokenizer(lang=lang)
        else:
            self._moses_tok = None

        if detokenize:
            self._moses_detok = MosesDetokenizer(lang=lang)
        else:
            self._moses_detok = None

    @property
    def vocab_size(self):
        return len(self.vocab) + len(self.special_tokens)

    def pre_tokenize(self, line):
        if getattr(self, '_moses_tok', None) is not None:
            return self._moses_tok.tokenize(line, return_str=True)
        return line

    def post_detokenize(self, tokens):
        if getattr(self, '_moses_detok', None) is not None:
            return self._moses_detok.detokenize(tokens, return_str=False)
        return tokens

    def idx2word(self, idx):
        if idx < len(self.special_tokens):
            return self.special_tokens[idx]
        else:
            return self.vocab[idx - len(self.special_tokens)][0]

    def update_word2idx(self):
        self.__word2idx = {
            word[0]: idx + len(self.special_tokens)
            for idx, word in enumerate(self.vocab)
        }
        for i, tok in enumerate(self.special_tokens):
            self.__word2idx[tok] = i

    def word2idx(self, word):
        return self.__word2idx.get(word, UNK)

    def segment(self, line, sample=None):
        """segments a line to tokenizable items"""
        line = self.pre_tokenize(line)
        return _segment_words(line)

    def get_vocab(self, item_list, from_filenames=True, limit=None):
        vocab = _get_vocabulary(item_list=item_list,
                                segment=self.segment,
                                from_filenames=from_filenames)
        self.vocab = vocab.most_common(limit)
        self.update_word2idx()

    def save_vocab(self, vocab_filename):
        if self.vocab is not None:
            with codecs.open(vocab_filename, 'w', encoding='UTF-8') as f:
                for (key, freq) in self.vocab:
                    f.write("{0} {1}\n".format(key, freq))

    def load_vocab(self, vocab_filename, limit=None, min_count=1):
        vocab = OrderedCounter()
        with codecs.open(vocab_filename, encoding='UTF-8') as f:
            for line in f:
                try:
                    word, count = line.strip().split()
                except:  # no count
                    word, count = line.strip(), 1
                count = int(count)
                if count >= min_count:
                    vocab[word] = count
        self.vocab = vocab.most_common(limit)
        self.update_word2idx()

    def tokenize(self, line, insert_start=None, insert_end=None, sample=None):
        """tokenize a line, insert_start and insert_end are lists of tokens"""
        inputs = self.segment(line)
        targets = []
        if insert_start is not None:
            targets += insert_start
        for w in inputs:
            targets.append(self.word2idx(w))
        if insert_end is not None:
            targets += insert_end
        return torch.LongTensor(targets)

    def detokenize(self, inputs, delimiter=u' '):
        token_list = [self.idx2word(int(idx)) for idx in inputs]
        token_list = self.post_detokenize(token_list)
        outputs = delimiter.join(token_list)
        return outputs
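Illustrative usage of the Tokenizer above; the vocabulary path and the BOS/EOS index constants come from the surrounding project and are assumptions here:

tok = Tokenizer(vocab_file='vocab.en.txt', use_moses='en')
ids = tok.tokenize("Hello , world !", insert_start=[BOS], insert_end=[EOS])
text = tok.detokenize(ids[1:-1])   # drop BOS/EOS, map ids back to words, Moses-detokenize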
Code example #7
def load_model(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return: conf dict holding the loaded model, vocabularies, and the pre-/post-processing functions.
    """
    conf = {}
    cfg_file = model_dir + "/config.yaml"

    logger = logging.getLogger(__name__)
    conf["logger"] = logger
    # load the Joey configuration
    cfg = load_config(cfg_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False)
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        def tokenizer(x): return src_tokenizer.tokenize(x, return_str=True)
        def detokenizer(x): return trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        def tokenizer(x): return x
        def detokenizer(x): return x

    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        def segmenter(x): return bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        def segmenter(x): return list(x.strip())
    else:
        def segmenter(x): return x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(
        cfg["model"],
        src_vocab=conf["src_vocab"],
        trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])

    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
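A minimal sketch (not part of the original bot code) of how the returned conf dict could wrap a single translation request; `translate_fn` stands in for Joey NMT's decoding call and is an assumption:

def answer(conf, line, translate_fn):
    # translate_fn is an assumed callable that decodes one preprocessed source
    # string with conf["model"] and returns a raw hypothesis string.
    if conf["lowercase"]:
        line = line.lower()
    for fn in conf["preprocess"]:      # Moses tokenization, then BPE/char segmentation
        line = fn(line)
    hypothesis = translate_fn(conf["model"], line, beam_size=conf["beam_size"])
    for fn in conf["postprocess"]:     # Moses detokenization
        hypothesis = fn(hypothesis)
    return hypothesis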
Code example #8
class LegalDoc:

    # ----Constant Static fields----
    # Patterns
    __SENTENCING_IDENTIFIER_PATTERN = re.compile(r".+(DATE OF SENTENCE:)",
                                                 re.S | re.M)
    __EMPTY_LINE_PATTERN = re.compile(r"^[\s\t\n\r]*$")
    __FILE_SECTION_PATTERN = re.compile(r"Section:[0-9]+")
    __SECTION_PATTERN = re.compile(r"([0-9]+)[.(\s\t]*([A-Z].+)")
    __DOCUMENT_PATTERN = re.compile(
        r"(.+?)" +  # Head
        r"("  # Capture Body start
        r"(?:^1[.\t\s]*[A-Z])" +  # 1st section number and 1st capital letter
        r"(?:.+)"  # Body sans above capture
        r")",  # Capture Body End
        re.S | re.M)

    __CASE_NUMBER_PATTERN = re.compile(
        r"^.+" + r"(?:" + r"(?:AP|CR)" +  # E.g. "CR"
        r"|" + r"(?:Case No(?:[.\s\tA-Za-z]*))" +  # E.g. "Case No. X"
        r")" + r"([0-9\s-]+[0-9])" +  # E.g. "-12-34567"
        r"[\s\t]*$",
        re.S | re.M)
    __DEFENDANT_NAME_PATTERN = re.compile(
        r".+" + r"(?:^\|[\s]+[vV][\s]+\|$)" +  # E.g. "| v |"
        r"(?:[-\s]+)" +  # E.g. "------"
        r"^\|[\s]+"  # E.g. "|  "
        r"([A-Za-z\s-]+)",  # E.g. "John Smith"
        re.S | re.M)

    __JUDGE_NAME_PATTERN = re.compile(
        r".+JUDGE:[\s|]+" +  # E.g. "JUDGE: |:
        r"(?:(?:HIS|HER)[\s].+[\s]+JUDGE?)?" +  # E.g. "HIS HONOUR CHIEF JUDGE"
        r"([a-zA-Z\s.']+)\|",  # E.g. "J. Smith"
        re.S | re.M | re.I)

    NAME_SIMPLIFIER_PATTERN = re.compile(r"([A-Z][A-Za-z]+)")
    SECTION_IDENTIFIER_PATTERN = re.compile(r"(SECTIONSTART[0-9]+:?\s?)")

    # Settings
    ANONYMIZE_NAMES: bool = False
    CLEAN_DATA: bool = False
    REMOVE_PUNCTUATION: bool = False
    REMOVE_STOP_WORDS: bool = False
    APPLY_STEMMING: bool = False
    APPLY_LEMMATIZATION: bool = False
    TO_LOWER_CASE: bool = False

    # ~95% success rate if False but ~20% of LegalDocs will have
    # a generated case number and/or the sections will lack structure.
    # Otherwise, ~75% success rate if True

    EXIT_IF_ERRORS: bool = True

    # Singletons
    MONTHS = [
        "January", "February", "March", "April", "May"
        "June", "July", "August", "September", "October", "November",
        "December"
    ]
    SENTENCE_TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
    STOP_WORDS = set(stopwords.words("english"))
    STEMMER = PorterStemmer()
    LEMMATIZER = WordNetLemmatizer()
    DETOKENIZER = MosesDetokenizer()
    TFIDF_VECTORIZER = TfidfVectorizer()
    NAMES = sorted(
        list(
            set(
                nltk.corpus.names.words('male.txt') +
                nltk.corpus.names.words('female.txt'))))

    # ---- Static fields----
    s_successful_init_count: int = 0
    s_failed_init_count: int = 0
    s_exception_data_dict: dict = dict()
    s_legal_doc_dict: dict = dict()

    s_file_error: int = 0
    s_case_error: int = 0
    s_judge_error: int = 0
    s_defendant_error: int = 0
    s_sentencing_error: int = 0
    s_section_error: int = 0
    s_body_error: int = 0
    s_parsing_error: int = 0
    s_unknown_error: int = 0

    # ----Constructor----
    def __init__(self):

        # Initialise fields
        self.__f_path = "NULL"
        self.__f_file_name = "NULL"
        self.__f_head = "NULL"
        self.__f_body = []
        self.__f_case_number = "NULL"
        self.__f_judge_name = "NULL"
        self.__f_defendant_name = "NULL"

        self.__f_corpus = []

        self.__f_sentencing_document = False
        self.__f_parsing_error = False
        self.__f_punctuation_removed = False
        self.__f_lower_case = False
        self.__f_stop_words_removed = False
        self.__f_stemmed = False
        self.__f_lemmatized = False
        self.__f_contains_errors = False
        self.__f_tokenized_sentences = False

    # ----Instance methods----
    # Initialise
    def initialise(self, a_path, load_state):
        """"
        Initialises a LegalDoc instance from a file
        Separated from __init__ to avoid exceptions in the constructor
        This method must be executed after construction
        :param str a_path: The path to a legal document
        :param bool load_state: Whether the provided path points to a formatted file (true) or an unformatted file (false)
        :rtype: bool
        :return: Whether an instance was successfully generated from file at the provided path
        """

        # Initialise path
        self.__f_path = a_path

        # Get file name
        head, tail = ntpath.split(self.path)
        self.__f_file_name = (tail or ntpath.basename(head))

        # Read in file
        l_file = None
        try:
            l_file = open(self.path)
            l_file_content = l_file.read()
            l_file.close()

        # Handle file error
        except IOError:
            LegalDoc.__note_exception(self.path,
                                      "MAJOR ERROR: Unable to read file", True)
            if l_file is not None:
                l_file.close()
            return False

        # Load state from a formatted file
        if load_state:

            # TODO - Timer start
            Timers.s_init_load_state_timer.start()

            l_succeeded = self.__initialise_load_state(l_file_content)

            # TODO - Timer stop
            Timers.s_init_load_state_timer.stop()

            if not l_succeeded:
                LegalDoc.s_file_error += 1
                return False

        # Generate state from an unformatted file
        else:

            # TODO - Timer start
            Timers.s_init_gen_state_timer.start()

            l_succeeded = self.__initialise_generate_state(l_file_content)

            # TODO - Timer stop
            Timers.s_init_gen_state_timer.stop()

            if not l_succeeded:
                return False
            self.__f_punctuation_removed = LegalDoc.REMOVE_PUNCTUATION
            self.__f_lower_case = LegalDoc.TO_LOWER_CASE
            self.__f_stop_words_removed = LegalDoc.REMOVE_STOP_WORDS
            self.__f_stemmed = LegalDoc.APPLY_STEMMING
            self.__f_lemmatized = LegalDoc.APPLY_LEMMATIZATION

        # Note successful initialisation
        LegalDoc.s_successful_init_count += 1

        # Add current LegalDoc to static dictionary of LegalDocs
        LegalDoc.s_legal_doc_dict[self.file_name] = self

        # Create judge and add it to static dictionary of judges
        Judge.add_legal_doc(self)

        return True

    # Load state from a formatted file
    def __initialise_load_state(self, a_file_content):
        """"
        Initialises a LegalDoc instance from a formatted file
        :type a_file_content: str
        :rtype: bool
        :return: Whether an instance was successfully generated from the source file content
        """

        try:
            l_lines = a_file_content.splitlines()
            i = 0

            # Verify this is a formatted LegalDoc
            if l_lines[0] == "FIELD DATA:":
                i += 1

                # Read in field data
                while l_lines[i] != "SECTIONS:":
                    l_line = l_lines[i].strip()

                    # File name
                    if l_line == "FILE NAME:":
                        self.__f_file_name = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Case number
                    if l_line == "CASE NUMBER:":
                        self.__f_case_number = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Judge name
                    if l_line == "JUDGE NAME:":
                        self.__f_judge_name = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Defendant name
                    if l_line == "DEFENDANT NAME:":
                        self.__f_defendant_name = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Sentencing document
                    if l_line == "PRISON DOCUMENT:":
                        self.__f_sentencing_document = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Punctuation removed
                    if l_line == "PUNCTUATION REMOVED:":
                        self.__f_punctuation_removed = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Stop words removed
                    if l_line == "STOP WORDS REMOVED:":
                        self.__f_stop_words_removed = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Lower case
                    if l_line == "LOWER CASE:":
                        self.__f_lower_case = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Stemmed
                    if l_line == "STEMMED:":
                        self.__f_stemmed = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Lemmatized
                    if l_line == "LEMMATIZED:":
                        self.__f_lemmatized = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Contains errors
                    if l_line == "CONTAINS ERRORS:":
                        self.__f_contains_errors = ast.literal_eval(
                            l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Skip line
                    i += 1

                # Read in section data
                l_section_index = -1
                i += 1
                while i < len(l_lines):
                    l_line = l_lines[i].strip()

                    # Reading a section heading
                    if LegalDoc.__FILE_SECTION_PATTERN.match(l_line):
                        l_section_index += 1
                        self.__f_body.append([])

                    # Reading a section's contents
                    else:
                        self.__f_body[l_section_index].append(l_line)
                    i += 1

            # Create corpora
            for l_section in self.body:
                for l_sentence in l_section:
                    self.__f_corpus += word_tokenize(l_sentence)

            return True
        except IndexError:
            LegalDoc.__note_exception(
                self.path,
                "MAJOR ERROR: Failed to import formatted file, index out of bounds",
                True)
            return False

    # Generate state from an unformatted file
    def __initialise_generate_state(self, a_file_content):
        """"
        Initialises a LegalDoc instance from an unformatted file
        :type a_file_content: str
        :rtype: bool
        :return: Whether an instance was successfully generated from the source file content
        """

        try:

            # Break up document into base components
            l_document_match = LegalDoc.__DOCUMENT_PATTERN.match(
                a_file_content)
            if l_document_match:
                l_document_groups = l_document_match.groups()

            # Handle document parsing error
            else:
                LegalDoc.__note_exception(
                    self.path, "MAJOR ERROR: Regex cannot parse document",
                    True)
                LegalDoc.s_parsing_error += 1
                return False

            # Extract head
            self.__f_head = l_document_groups[0]

            # Extract sentencing identifier
            if not self.__extract_sentencing_identifier():
                return False

            # Extract case number
            if not self.__extract_case_number():
                return False

            # Extract defendant's name
            if not self.__extract_defendant_name():
                return False

            # Extract judge's name
            if not self.__extract_judge_name():
                return False

            # Group lines into sections
            l_lines = l_document_groups[1].splitlines()  # body broken down by line
            if not self.__group_lines_into_sections(l_lines):
                return False

            # Anonymize names
            self.__anonymize_names()

            # Clean sections
            self.__clean_sections()

            # Initialisation completed with no errors
            return True

        # Handle miscellaneous errors
        except Exception:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unspecified error occurred", True)
            LegalDoc.s_unknown_error += 1
            raise
            return False

    # Anonymize names
    def __anonymize_names(self):
        """"
        Changes every instance of the defendant's name to "Defendant"
        Encrypts the judge's name
        Assigns a random name to everybody else
        """

        if LegalDoc.ANONYMIZE_NAMES:
            # TODO - Timer start
            Timers.s_anonymize_names_timer.start()

            # Generate corpus
            self.generate_corpus_from_sections()

            # TODO - Timer start
            Timers.s_anonymization_timer.start()

            # Get list of names
            l_filtered_corpus = [w for w in self.corpus if w[0].isupper()]
            # print(l_filtered_corpus)

            # print(l_filtered_corpus)
            # l_names = [w for w in LegalDoc.NAMES if w in l_filtered_corpus]

            l_names = []
            for w in l_filtered_corpus:
                i = LegalDoc.index(LegalDoc.NAMES, w)
                if i is not None:
                    l_names.append(LegalDoc.NAMES[i])

            # TODO - Timer stop
            Timers.s_anonymization_timer.stop()

            # print("Namesxxx: " + str(l_names))
            # print("All Names: " + str(sorted(LegalDoc.NAMES)))

            # Create a random name dictionary
            l_random_names = dict()
            for l_name in l_names:
                l_random_index = random.randint(0, len(LegalDoc.NAMES) - 1)
                l_random_names[l_name] = LegalDoc.NAMES[l_random_index]

            # Anonymize names
            for i, l_word in enumerate(self.corpus):
                try:
                    if l_word in self.defendant_name:
                        if self.corpus[i - 1] == "Defendant":
                            del self.corpus[i]
                        else:
                            self.corpus[i] = "Defendant"
                    elif l_word in self.judge_name:
                        if self.corpus[i - 1] == "Judge":
                            del self.corpus[i]
                        else:
                            self.corpus[i] = "Judge"
                    elif l_word in l_names and l_word not in LegalDoc.MONTHS:
                        self.corpus[i] = l_random_names[l_word]

                except IndexError:
                    print("FAIL")
                    continue

            self.generate_sections_from_corpus()

            # TODO - Timer stop
            Timers.s_anonymize_names_timer.stop()

    def __extract_sentencing_identifier(self):
        """"
        Extracts the sentencing identifier from this legal document's head
        sets the value of "__f_sentencing_document" as a bool
        :rtype: bool
        :return: Whether the sentencing identifier was successfully extracted and set
        """

        # Extract sentencing identifier
        l_sentencing_identifier_match = LegalDoc.__SENTENCING_IDENTIFIER_PATTERN.match(
            self.__f_head)
        if l_sentencing_identifier_match:
            self.__f_sentencing_document = True
            return True

        # Handle non sentencing document
        else:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: This is not a sentencing document",
                True)
            LegalDoc.s_sentencing_error += 1
            return False

    # Extract case number
    def __extract_case_number(self):
        """"
        Extracts the case number from this legal document's head and cleans it
        sets the value of "l_case_num_match" as a string
        :rtype: bool
        :return: Whether the case number was successfully extracted and set
        """

        # Extract case number
        l_case_num_match = LegalDoc.__CASE_NUMBER_PATTERN.match(self.__f_head)
        if l_case_num_match:

            # Extract case number whilst removing dashes, spaces and tabs
            self.__f_case_number = (l_case_num_match.groups())[0].translate(
                {ord(c): None
                 for c in r'-    '})
            return True

        # Handle failure to find a case number
        else:
            LegalDoc.__note_exception(self.path,
                                      "ERROR: Unable to find case number",
                                      LegalDoc.EXIT_IF_ERRORS)
            LegalDoc.s_case_error += 1

            if LegalDoc.EXIT_IF_ERRORS:
                return False
            return True

    # Extract defendant's name
    def __extract_defendant_name(self):
        """"
        Extracts the defendant's name from this legal document's head
        Removes initials from the defendant's name
        Ensure the defendant's name is all lower case with the exception of the first letter
        sets the value of "__f_defendant_name" as a set of strings
        (e.g. "John Smith" becomes {"John", "Smith"})
        :rtype: bool
        :return: Whether the defendant's name was successfully extracted and set
        """

        l_defendant_name_match = LegalDoc.__DEFENDANT_NAME_PATTERN.match(
            self.head)

        # Check for regex match
        if l_defendant_name_match:

            # Clean name and set value of "__f_defendant_name"
            self.__f_defendant_name = (
                l_defendant_name_match.groups())[0].strip()
            self.__f_defendant_name = LegalDoc.NAME_SIMPLIFIER_PATTERN.findall(
                self.defendant_name)
            self.__f_defendant_name = set(
                [x.lower().capitalize() for x in self.defendant_name])
            return True

        # Handle inability to determine defendant's name
        else:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unable to find defendant's name",
                True)
            LegalDoc.s_defendant_error += 1
            return False

    # Extract judge's name
    def __extract_judge_name(self):
        """"
        Extracts the judge's name from this legal document's head
        Removes initials from the judge's name
        Ensure the judge's name is all lower case with the exception of the first letter
        sets the value of "__f_judge_name" as a set of strings
        (e.g. "John Smith" becomes {"John", "Smith"})
        :rtype: bool
        :return: Whether the judge's name was successfully extracted and set
        """

        # Check for regex match
        l_judge_name_match = LegalDoc.__JUDGE_NAME_PATTERN.match(self.head)
        if l_judge_name_match:

            # Clean name and set value of "__f_judge_name"
            self.__f_judge_name = (l_judge_name_match.groups())[0].strip()
            self.__f_judge_name = LegalDoc.NAME_SIMPLIFIER_PATTERN.findall(
                self.judge_name)
            self.__f_judge_name = set(
                [x.lower().capitalize() for x in self.judge_name])
            return True

        # Handle inability to determine judge's name
        else:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unable to find judge's name", True)
            LegalDoc.s_judge_error += 1
            return False

    # Group the body's lines into sections made up of sentences
    def __group_lines_into_sections(self, a_lines):
        """"
        Groups the provided list of lines into sections comprised of sentences
        :type a_lines: list
        :rtype: bool
        :return: Whether the lines were successfully grouped into sections
        """

        l_sections = []

        # Group lines into sections
        try:
            l_section_index = 0  # Used to check whether sections are being missed
            l_sections.append("")  # l_sections[0] catches any lines prior to the first section
            l_bad_sections = False  # True if any problems are encountered whilst parsing sections

            # For each line in l_lines...
            for l_line in a_lines:
                l_section_match = LegalDoc.__SECTION_PATTERN.match(l_line)

                # Check if the line contains the start of a section
                if l_section_match:

                    # Remove the section number from the line
                    l_line = LegalDoc.__SECTION_PATTERN.sub(
                        r"\g<2>", l_line, 1)

                    # If the section number in the line matches the l_section_index
                    if l_section_match[1] == str(l_section_index + 1):
                        l_section_index += 1
                        l_section = "SECTIONSTART" + str(
                            l_section_index) + ":\t" + l_line
                        l_sections.append(l_section)

                    # A parsing error has occurred
                    else:
                        l_bad_sections = True
                        l_sections[l_section_index] += l_line

                # Check if the line is empty
                elif LegalDoc.__EMPTY_LINE_PATTERN.match(l_line):
                    continue

                # This line is not the start of a section nor is it empty
                else:

                    # The line is part of a section
                    if l_section_index > 0:
                        l_sections[l_section_index] += l_line

                    # This line is prior to all sections. Add it to section 0
                    else:
                        l_bad_sections = True
                        l_sections[l_section_index] += l_line

            # Handle section parsing errors
            if l_bad_sections:
                self.__f_contains_errors = True
                LegalDoc.__note_exception(self.path, "ERROR: Bad section(s)",
                                          LegalDoc.EXIT_IF_ERRORS)
                LegalDoc.s_section_error += 1
                if LegalDoc.EXIT_IF_ERRORS:
                    return False

            # Break up sections into sentences
            # Add N sentence arrays to body, where N is the number of sections
            for l_section in l_sections:
                self.body.append(
                    LegalDoc.SENTENCE_TOKENIZER.tokenize(l_section))

            return True

        # Handle failure to parse document's body
        except (TypeError, AttributeError, IndexError):
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unable to break down body", True)
            LegalDoc.s_body_error += 1
            return False

    # Tokenize each sentence in each section in the body
    def tokenize_sentences(self):

        # Check if sentences are already tokenized
        if not self.tokenized_sentences:
            l_tokenized_sections = []

            # By section
            for l_section in self.body:
                l_tokenized_sentences = []

                # By sentence
                for l_sentence in l_section:

                    # Add tokenized sentences to section
                    l_tokenized_sentences.append(word_tokenize(l_sentence))

                # Add tokenized sections to sections list
                l_tokenized_sections.append(l_tokenized_sentences)

            # Update body
            self.__f_body = l_tokenized_sections

            self.__f_tokenized_sentences = True

    # Detokenize each sentence in each section in the body
    def detokenize_sentences(self):

        # Check if sentences are already tokenized
        if self.tokenized_sentences:
            l_detokenized_sections = []

            # By section
            for l_section in self.body:
                l_detokenized_sentences = []

                # By sentence
                for l_sentence in l_section:

                    # Add detokenized sentences to section
                    l_detokenized_sentences.append(
                        LegalDoc.DETOKENIZER.detokenize(l_sentence,
                                                        return_str=True))

                # Add detokenized sections to sections list
                l_detokenized_sections.append(l_detokenized_sentences)

            # Update body
            self.__f_body = l_detokenized_sections

            self.__f_tokenized_sentences = False

    # Generates untokenized sections from the words in the corpus
    def generate_sections_from_corpus(self):
        """"
        This method will not work properly if punctuation has been removed
        or if all words have been lower cased
        """

        # TODO - Timer start
        Timers.s_gen_secs_from_corpus_timer.start()

        self.__f_body = []
        l_section_words = []

        for l_word in self.corpus:
            if LegalDoc.SECTION_IDENTIFIER_PATTERN.match(l_word):

                # Detokenize section words list into a string
                l_detokenized_section = LegalDoc.DETOKENIZER.detokenize(
                    l_section_words, return_str=True)

                # Tokenize section string into sentences (a list of strings)
                self.body.append(
                    LegalDoc.SENTENCE_TOKENIZER.tokenize(
                        l_detokenized_section))

                # New section
                l_section_words = [l_word]
            else:
                # Add word to section words list
                l_section_words.append(l_word)

        # TODO - Timer stop
        Timers.s_gen_secs_from_corpus_timer.stop()

    # Creates a corpus from the sentences in the body's sections
    def generate_corpus_from_sections(self):
        """"
        This method will not work properly if punctuation has been removed
        or if all words have been lower cased
        """

        # TODO - Timer start
        Timers.s_gen_corpus_from_secs_timer.start()

        # Check whether sentences are tokenized already
        if self.tokenized_sentences:

            # Create corpus from tokenized sentences
            for l_section in self.body:
                for l_sentence in l_section:
                    for l_word in l_sentence:
                        self.corpus.append(l_word)

        else:

            # Create corpus from untokenized sentences
            for l_section in self.body:
                for l_sentence in l_section:
                    for l_word in word_tokenize(l_sentence):
                        self.corpus.append(l_word)

        # TODO - Timer stop
        Timers.s_gen_corpus_from_secs_timer.stop()

    # Cleans each sentence in each section in the body
    def __clean_sections(self):
        if LegalDoc.CLEAN_DATA:

            # TODO - Timer start
            Timers.s_clean_sections_timer.start()

            # Tokenize sections
            self.tokenize_sentences()

            # Clean data
            l_filtered_sections = []

            # By section
            for l_section in self.body:
                l_filtered_sentences = []

                # By sentence
                for l_sentence in l_section:
                    l_filtered_words = []

                    # By word
                    for l_word in l_sentence:

                        # Remove stopwords
                        if l_word in LegalDoc.STOP_WORDS and LegalDoc.REMOVE_STOP_WORDS:
                            continue

                        # Stemming
                        if LegalDoc.APPLY_STEMMING:
                            l_word = LegalDoc.STEMMER.stem(l_word)

                        # Lemmatization
                        if LegalDoc.APPLY_LEMMATIZATION:
                            l_word = LegalDoc.LEMMATIZER.lemmatize(l_word)

                        # Remove punctuation
                        if LegalDoc.REMOVE_PUNCTUATION:
                            l_word = l_word.translate(
                                str.maketrans('', '', string.punctuation))

                        # To lower case
                        if LegalDoc.TO_LOWER_CASE:
                            l_word = l_word.lower()

                        # Add filtered word to sentence
                        l_filtered_words.append(l_word)

                    # Add filtered sentence to section
                    l_filtered_sentences.append(l_filtered_words)

                # Add filtered section to section list
                l_filtered_sections.append(l_filtered_sentences)

            # Update body
            self.__f_body = l_filtered_sections

            # Create corpus from sections
            self.generate_corpus_from_sections()

            # Detokenize sentences
            self.detokenize_sentences()

            # TODO - Timer stop
            Timers.s_clean_sections_timer.stop()

    # Strip section identifiers
    def strip_section_identifiers(self, a_generate_corpus=True):
        for l_section in self.body:
            for i, l_sentence in enumerate(l_section):
                l_section[i] = LegalDoc.SECTION_IDENTIFIER_PATTERN.sub(
                    "", l_sentence, 1)

        if a_generate_corpus:
            self.generate_corpus_from_sections()

    # Save formatting as a txt file
    def write(self, a_raw_text=False, a_prefix="", a_new_path=""):
        """"
        Writes the data in this instance to a .TXT file
        The case name and number are used to name the file
        """

        # TODO - Timer start
        Timers.s_write_timer.start()

        # Make sure that "CaseName" and  "CaseNumber" do not contain illegal values and are not excessively long
        l_safe_file_name = re.sub(r'[\\/:"*?<>|]+', "", self.file_name)
        l_safe_file_name = (
            l_safe_file_name[:25] +
            '..') if len(l_safe_file_name) > 25 else l_safe_file_name

        # l_safe_case_number = re.sub(r'[\\/:"*?<>|]+', "", self.case_number)
        # l_safe_case_number = (l_safe_case_number[:25] + '..') if len(l_safe_case_number) > 25 else l_safe_case_number

        # Add brackets to prefix if specified
        if a_prefix:
            a_prefix = "(" + a_prefix + ")"

        # Select path string based on input
        if a_new_path:
            l_path = a_new_path
        else:
            l_path = "Resources/Output/Formatted/"

        # Make path if it doesn't exist
        if not os.path.exists(l_path):
            os.makedirs(l_path)

        # Save file
        l_save_file = None
        try:
            l_save_file = open(l_path + a_prefix + "(F) " + l_safe_file_name,
                               "w",
                               encoding="UTF-8")

            # Remove section identifiers
            self.strip_section_identifiers(True)

            # Write formatted LegalDoc
            if not a_raw_text:
                l_save_file.write(self.__str__())

            # Only write body's contents (unformatted)
            else:
                for l_section in self.body:
                    for l_sentence in l_section:
                        l_save_file.write(l_sentence + " \n")

            l_save_file.close()

            # TODO - Timer stop
            Timers.s_write_timer.stop()

        # Handle IO Exception
        except IOError:
            print("ERROR: Unable to save file with path: " + self.path)
            if l_save_file is not None:
                l_save_file.close()

            # TODO - Timer stop
            Timers.s_write_timer.stop()

    # ----Method Overrides----
    # Override str(self) with formatted body output
    def __str__(self):

        # Write field data
        l_info = "FIELD DATA:\n"
        l_info += "\tFILE NAME:\n\t\t" + self.file_name + "\n"
        l_info += "\tCASE NUMBER:\n\t\t" + self.case_number + '\n'
        l_info += "\tJUDGE NAME:\n\t\t" + str(self.judge_name) + '\n'
        l_info += "\tDEFENDANT NAME:\n\t\t" + str(self.defendant_name) + '\n'

        l_info += "\tPRISON DOCUMENT:\n\t\t" + str(
            self.sentencing_document) + '\n'
        l_info += "\tPUNCTUATION REMOVED:\n\t\t" + str(
            self.punctuation_removed) + '\n'
        l_info += "\tLOWER CASE:\n\t\t" + str(self.lower_case) + '\n'
        l_info += "\tSTOP WORDS REMOVED:\n\t\t" + str(
            self.stop_words_removed) + '\n'
        l_info += "\tSTEMMED:\n\t\t" + str(self.stemmed) + '\n'
        l_info += "\tLEMMATIZED:\n\t\t" + str(self.lemmatized) + '\n'
        l_info += "\tCONTAINS ERRORS:\n\t\t" + str(self.contains_errors) + '\n'

        l_info += "SECTIONS:" + '\n'

        # Write the section headers
        for i in range(0, len(self.body)):
            l_section = self.body[i]
            l_info += '\t' "Section:" + str(i) + '\n'

            # Write the sentences corresponding to the above section
            for l_sentence in l_section:
                l_info += "\t\t" + l_sentence + '\n'

        return l_info

    # ----Class Methods----
    # Prints all the exception data in the exception dict as well as some basic summary statistics
    @classmethod
    def print_exception_data(cls):

        # Write general error data
        l_error_data = "Successful initialisations: " + str(
            cls.s_successful_init_count) + '\n'
        l_error_data += "Failed initialisations: " + str(
            cls.s_failed_init_count) + '\n'
        l_error_data += "Success rate: " + \
                        str((cls.s_successful_init_count * 1.0) /
                            ((cls.s_failed_init_count * 1.0) + (cls.s_successful_init_count * 1.0))) + '\n'

        l_error_data += ("File errors: " + str(LegalDoc.s_file_error) +
                         "Case errors: " + str(LegalDoc.s_case_error) +
                         "Judge errors: " + str(LegalDoc.s_judge_error) +
                         "Defendant errors: " +
                         str(LegalDoc.s_defendant_error) +
                         "Sentencing errors: " +
                         str(LegalDoc.s_sentencing_error) +
                         "Section errors: " + str(LegalDoc.s_section_error) +
                         "Body errors: " + str(LegalDoc.s_body_error) +
                         "Unknown errors: " + str(LegalDoc.s_unknown_error))

        l_error_data += "Exceptions: " + '\n'

        # For each LegalDoc containing one or more errors
        # l_path is the key, l_errors is the value
        for l_path, l_errors in cls.s_exception_data_dict.items():

            # Write the path of the LegalDoc
            l_error_data += "\t" + l_path + '\n'

            # Write the errors associated with the above LegalDoc
            for l_error in l_errors:
                l_error_data += "\t\t" + l_error + '\n'

        print(l_error_data)

    # Notes an exception
    @classmethod
    def __note_exception(cls, a_path: str, a_exception: str,
                         a_failed_init: bool):
        """"
        Adds the provided exception data to the exception dict using the provided path.
        Also increments the classes failed init count
        :type a_path: str
        :type a_exception: str
        :type a_failed_init: bool
        """

        # If the path already exists in the exception dictionary
        if a_path in cls.s_exception_data_dict:
            cls.s_exception_data_dict[a_path].append(a_exception)

        # Add the path to the exception dictionary
        else:
            cls.s_exception_data_dict[a_path] = [a_exception]

        if a_failed_init:
            # Increment static counter for failed initialisation
            cls.s_failed_init_count += 1

    # TODO Work on this method
    @classmethod
    def get_docs_by_regex(cls, a_regex, a_filter_labelled):
        """
        Gets a list of LegalDocs whose bodies' match the provided pattern
        :param str a_regex: A pattern used to get LegalDocs
        :return: A list of LegalDocs
        :rtype: list

        """
        assert (isinstance(a_regex, re.Pattern))

        l_matching_docs = []
        l_break = False

        for l_legal_doc in LegalDoc.s_legal_doc_dict.values():
            l_break = False

            for l_section in l_legal_doc.body:
                if l_break:
                    break

                for l_sentence in l_section:

                    # Look for match
                    l_match = a_regex.match(l_sentence)

                    if l_match:
                        # TODO Annoying import bug, fix later
                        if (not Label.Label.s_flat_labels_dict[l_legal_doc.file_name]
                                or not a_filter_labelled):
                            l_matching_docs.append(l_legal_doc.file_name)
                        l_break = True
                        break

        print(l_matching_docs)
        print(len(l_matching_docs))

    # ----Properties (Read only getters)----
    # Origin path
    @property
    def path(self):
        """"
        :rtype: str
        :return: The path of the file that originally generated this LegalDoc instance
        """

        return self.__f_path

    # File name
    @property
    def file_name(self):
        """"
        :rtype: str
        :return: The name of the file that originally generated this LegalDoc instance
        """

        return self.__f_file_name

    # Head
    @property
    def head(self):
        """"
        :rtype: str
        :return: Summary information of the court proceeding
        """

        return self.__f_head

    # Body
    @property
    def body(self):
        """"
        :rtype: list
        :return: The transcript of the court proceeding. Broken down into sections (list) comprised of sentences (str)
        """

        return self.__f_body

    # Case number
    @property
    def case_number(self):
        """"
        :rtype: str
        :return: The document's case number
        """

        return self.__f_case_number

    # Judge's name
    @property
    def judge_name(self):
        """"
        :rtype: str
        :return: The judge's name
        """

        return self.__f_judge_name

    # Defendant name
    @property
    def defendant_name(self):
        """"
        :rtype: str
        :return: The defendant's name
        """

        return self.__f_defendant_name

    # Sentencing document
    @property
    def sentencing_document(self):
        """"
        :rtype: bool
        :return: Whether the document pertains to the sentencing of an individual
        """

        return self.__f_sentencing_document

    # Punctuation Removed
    @property
    def punctuation_removed(self):
        """"
        :rtype: bool
        :return: Whether the document's contents have had punctuation removed
        """

        return self.__f_punctuation_removed

    # Lower case
    @property
    def lower_case(self):
        """"
        :rtype: bool
        :return: Whether the document's contents have been converted to lower case
        """

        return self.__f_lower_case

    # Stop words removed
    @property
    def stop_words_removed(self):
        """"
        :rtype: bool
        :return: Whether the document's contents have been stripped of stop words
        """

        return self.__f_stop_words_removed

    # Stemmed
    @property
    def stemmed(self):
        """"
        :rtype: bool
        :return: Whether the document's contents have been stemmed
        """

        return self.__f_stemmed

    # Lemmatized
    @property
    def lemmatized(self):
        """"
        :rtype: bool
        :return: Whether the document's contents have been lemmatized
        """

        return self.__f_lemmatized

    # Contains errors
    @property
    def contains_errors(self):
        """"
        :rtype: bool
        :return: Whether the document contains broken sections or a missing case number or both
        """

        return self.__f_contains_errors

    # Tokenized sentences
    @property
    def tokenized_sentences(self):
        """"
        :rtype: bool
        :return: Whether the document's sections contain tokenized sentences
        """

        return self.__f_tokenized_sentences

    # Corpus
    @property
    def corpus(self):
        """"
        :rtype: list
        :return: Corpus with each index corresponding to a single word
        """

        return self.__f_corpus

    # ----Static Methods----
    @staticmethod
    def index(a_list, a_value):
        """Locate the leftmost value exactly equal to x"""

        i = bisect.bisect_left(a_list, a_value)
        if i != len(a_list) and a_list[i] == a_value:
            return i
        return None
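
The `index` helper is the standard bisect recipe for finding the leftmost exact match in a sorted list. A minimal standalone sketch of the same idea (the sample data below is made up for illustration):

import bisect

def index(a_list, a_value):
    """Locate the leftmost value exactly equal to a_value; return None if absent."""
    i = bisect.bisect_left(a_list, a_value)
    if i != len(a_list) and a_list[i] == a_value:
        return i
    return None

sorted_names = ["adams", "baker", "clark", "clark", "davis"]
print(index(sorted_names, "clark"))   # 2 -> leftmost of the two "clark" entries
print(index(sorted_names, "evans"))   # None -> not present
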
コード例 #9
0
ファイル: detokenizer.py プロジェクト: sebag90/easymt
    def __init__(self, language):
        self.language = language
        self.detokenizer = MosesDetokenizer(lang=language)
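
A hedged sketch of how this small wrapper might be completed and used; the `__call__` method below is an addition for illustration and is not part of the excerpt:

from sacremoses import MosesDetokenizer


class Detokenizer:
    def __init__(self, language):
        self.language = language
        self.detokenizer = MosesDetokenizer(lang=language)

    def __call__(self, tokens):
        # join a list of Moses tokens back into plain text
        return self.detokenizer.detokenize(tokens)


detok = Detokenizer("en")
print(detok(["Hello", ",", "world", "!"]))  # Hello, world!
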
コード例 #10
0
    def morph(self, source, reference, constrain_pos=True):
        # Return Format (raw, translation, is attack success, query number, modif_rate)

        orig_tokenized = MosesTokenizer(lang='en').tokenize(source)
        # skip questions that are too long or too short
        if len(orig_tokenized) < 10 or len(orig_tokenized) > 100:
            return source, reference, None, None, None

        # generate candidates
        pos_tagged = [
            (tagged[0], '.') if '&' in tagged[0] else tagged
            for tagged in nltk.pos_tag(orig_tokenized, tagset='universal')
        ]

        token_inflections = self.get_inflections(orig_tokenized, pos_tagged,
                                                 constrain_pos)

        # get original bleu
        original_bleu, orig_predicted = self.get_bleu(source, reference)

        # skip examples whose original BLEU is already 0
        if original_bleu == 0:
            return source, reference, None, None, None

        forward_perturbed, forward_bleu, forward_predicted, num_queries_forward = self.search_nmt(
            token_inflections, orig_tokenized, source, original_bleu,
            reference)

        if forward_bleu == original_bleu:
            forward_predicted = orig_predicted

        # attack success
        if forward_bleu == 0:
            modif_rate = self.get_modif_rate(orig_tokenized, forward_perturbed)
            attack_text = MosesDetokenizer(
                lang='en').detokenize(forward_perturbed)
            return attack_text, forward_predicted, True, num_queries_forward + 1, modif_rate

        backward_perturbed, backward_bleu, backward_predicted, num_queries_backward = self.search_nmt(
            token_inflections,
            orig_tokenized,
            source,
            original_bleu,
            reference,
            backward=True)

        if backward_bleu == original_bleu:
            backward_predicted = orig_predicted
        num_queries = 1 + num_queries_forward + num_queries_backward
        if forward_bleu < backward_bleu:
            is_attack_success = False
            if forward_bleu == 0:
                is_attack_success = True
            modif_rate = self.get_modif_rate(orig_tokenized, forward_perturbed)
            attack_text = MosesDetokenizer(
                lang='en').detokenize(forward_perturbed)
            return attack_text, forward_predicted, is_attack_success, num_queries, modif_rate
        else:
            is_attack_success = False
            if backward_bleu == 0:
                is_attack_success = True
            modif_rate = self.get_modif_rate(orig_tokenized,
                                             backward_perturbed)
            attack_text = MosesDetokenizer(
                lang='en').detokenize(backward_perturbed)
            return attack_text, backward_predicted, is_attack_success, num_queries, modif_rate
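
The heart of `morph` is a tokenize-perturb-detokenize loop around BLEU re-scoring. A stripped-down, runnable sketch of that pattern; the single hand-picked substitution stands in for the inflection candidates produced by `get_inflections` and searched by `search_nmt`:

from sacremoses import MosesTokenizer, MosesDetokenizer

mt = MosesTokenizer(lang='en')
md = MosesDetokenizer(lang='en')

source = "The children are playing in the park."
tokens = mt.tokenize(source)

# Toy perturbation: swap one token for a morphological variant.
perturbed = tokens.copy()
perturbed[1] = "child"  # "children" -> "child" (illustration only)

attack_text = md.detokenize(perturbed)
print(attack_text)  # the perturbed sentence that would be re-scored with BLEU
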
コード例 #11
0
    def local_search_nmt(self,
                         token_inflections,
                         orig_tokenized,
                         original,
                         original_bleu,
                         reference,
                         backward=False):
        perturbed_tokenized = orig_tokenized.copy()

        best_bleu = original_bleu
        num_queries = 0
        best_predicted = ''

        detokenizer = MosesDetokenizer(lang='en')

        while True:
            new_tokenized_list = []
            new_bleu_list = []
            new_predicted_list = []

            for position, candidates in token_inflections:  # (position, candidates) pairs; candidates is a list of tokens
                # add or swap
                for infl in candidates:

                    if perturbed_tokenized[position] == infl:
                        continue

                    # do replace
                    new_tokenized = perturbed_tokenized.copy()
                    new_tokenized[position] = infl
                    # form text and eval
                    new_text = detokenizer.detokenize(new_tokenized)
                    new_bleu, new_predicted = self.get_bleu(
                        new_text, reference)
                    num_queries += 1

                    # record
                    new_tokenized_list.append(new_tokenized)
                    new_bleu_list.append(new_bleu)
                    new_predicted_list.append(new_predicted)

                # remove
                if perturbed_tokenized[position] != orig_tokenized[position]:
                    # do replace
                    new_tokenized = perturbed_tokenized.copy()
                    new_tokenized[position] = orig_tokenized[position]

                    # form text and eval
                    new_text = detokenizer.detokenize(new_tokenized)
                    new_bleu, new_predicted = self.get_bleu(
                        new_text, reference)
                    num_queries += 1

                    # record
                    new_tokenized_list.append(new_tokenized)
                    new_bleu_list.append(new_bleu)
                    new_predicted_list.append(new_predicted)

            if len(new_bleu_list) == 0:  # no candidates to evaluate
                break

            cur_best_idx = np.argsort(new_bleu_list)[0]
            cur_best_bleu = new_bleu_list[cur_best_idx]
            cur_best_predicted = new_predicted_list[cur_best_idx]
            cur_best_tokenized = new_tokenized_list[cur_best_idx]

            # check stop criteria
            if cur_best_bleu == 0:
                perturbed_tokenized = cur_best_tokenized
                best_bleu = cur_best_bleu
                best_predicted = cur_best_predicted
                break

            if cur_best_bleu < best_bleu - EPSILON:
                perturbed_tokenized = cur_best_tokenized
                best_bleu = cur_best_bleu
                best_predicted = cur_best_predicted
            else:
                break

        # =============== check supplement set ======================
        # form supplement set
        supplement_inflections_by_position = {
            position: []
            for position, _ in token_inflections
        }
        for position, candidates in token_inflections:
            for infl in candidates:
                if perturbed_tokenized[position] != infl:
                    supplement_inflections_by_position[position].append(infl)

        is_sup_valid = True
        valid_positions = []
        for position, _ in token_inflections:
            if len(supplement_inflections_by_position[position]) > 1:
                is_sup_valid = False
                break
            if len(supplement_inflections_by_position[position]) == 1:
                valid_positions.append(position)

        if len(valid_positions) == 0:
            is_sup_valid = False

        if is_sup_valid:
            print('check supplement')
            supplement_tokenized = perturbed_tokenized.copy()
            for position in valid_positions:
                supplement_tokenized[
                    position] = supplement_inflections_by_position[position][0]

            # form text and eval
            supp_text = detokenizer.detokenize(supplement_tokenized)
            supp_bleu, supp_predicted = self.get_bleu(supp_text, reference)
            num_queries += 1

            if supp_bleu < best_bleu:
                best_bleu = supp_bleu
                best_predicted = supp_predicted
                perturbed_tokenized = supplement_tokenized

        return perturbed_tokenized, best_bleu, best_predicted, num_queries
コード例 #12
0
    def hash_fn(self, string_to_hash):
        """Simple hash function"""
        md = MosesDetokenizer()
        if isinstance(string_to_hash, list):
            string_to_hash = md.detokenize(string_to_hash)
        return hashlib.sha224(string_to_hash.encode('utf-8')).hexdigest()
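
A standalone check of the idea behind the helper: a token list is detokenized before hashing, so a string and its token list produce the same SHA-224 digest:

import hashlib
from sacremoses import MosesDetokenizer

md = MosesDetokenizer()
tokens = ["a", "test", "sentence"]
text = "a test sentence"

assert md.detokenize(tokens) == text  # plain words are joined back with spaces
print(hashlib.sha224(text.encode('utf-8')).hexdigest())
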
コード例 #13
0
Created on Tue Feb 19 15:06:29 2019

@author: peterawest
"""

from itertools import combinations
import torch
from os import listdir
from random import shuffle
from nltk.tokenize import sent_tokenize
from sacremoses import MosesTokenizer, MosesDetokenizer
from gpt2_token_mod import gpt2_split, gpt2_join
from pytorch_pretrained_bert import GPT2Tokenizer

mt = MosesTokenizer()
mdt = MosesDetokenizer()
punctuation = ".,:;\"\'"

global_tokenizer = None


def token_split(s, method='split', tokenizer=None):
    ''' Given a string s, tokenize '''
    if method == 'split':
        return s.split()
    if method == 'moses':
        tokenized_text = mt.tokenize(s, return_str=True)
        return tokenized_text.split()
    if method == 'gpt2':
        if tokenizer is None:
            global global_tokenizer
コード例 #14
0
        finnish_stanza_tags_tup.append((j.text, j.xpos))
        finnish_stanza_tags.append(j.xpos)

tups_to_file('finlandes/OUTPUT.txt', finnish_stanza_tags_tup)

print("Finished tagging in Finnish: Helsinkiin")

# Chinese
# Tagging from the corpus GSDSimp
print("Started tagging in Chinese: GSDSimp")
chinese_dep_parse = '../../dependency/UD_Chinese-GSDSimp-master/zh_gsdsimp-ud-test.conllu'

with open(chinese_dep_parse, 'r') as gsdsimp_f:
    gsdsimp_text = conll_text_reader(gsdsimp_f)

detok = MosesDetokenizer()
with open('chino/INPUT.txt', 'w') as f:
    for s in gsdsimp_text:
        sent = detok.detokenize(s)
        f.write(sent + '\n')

nlp_zh = stanza.Pipeline(processors='tokenize,pos,lemma,depparse',
                         tokenize_pretokenized=True,
                         lang='zh')
stanza_model = nlp_zh(gsdsimp_text)

chinese_stanza_tags_tup = []
chinese_stanza_tags = []
for i in stanza_model.sentences:
    for j in i.words:
        chinese_stanza_tags_tup.append((j.text, j.xpos))
コード例 #15
0
class SummaryPicker:
    def __init__(self, exp_path):
        
        self.exp_path = exp_path
        self.load_experiment()
        self.detokenizer = MosesDetokenizer(lang='en')
        self.truecaser = MosesTruecaser(load_from='sm.cnndm.tc.model')

    def cleanup(self, line, append=False):

        # todo, use proper regex
        line = line.replace('- lrb -', '(')
        line = line.replace('- rrb -', ')')
        line = line.replace('- lsb -', '[')
        line = line.replace('- rsb -', ']')
        line = line.replace('`', "'")
        line = self.detokenizer.detokenize(line.split(' '))

        # line = line.replace(" 's ", "'s ")
        # line = line.replace(" 'd ", "'d ")
        # line = line.replace("' s ", "'s ")
        # line = line.replace(" n '", "n'")
        # line = line.replace(" n' ", "n'")
        line = line.replace(" - - ", " -- ")
        line = line.replace(" - ", "-")
        line = re.sub(r', (\d{3})', r',\1', line)
        line = line.replace("i' m ", "i'm ")
        line = line.replace(" 'll ", "'ll ")
        line = line.replace("' ll ", "'ll ")
        line = re.sub(r" '([a-zA-Z]{1}) ", r"'\1 ", line)
        line = re.sub(r"' ([a-zA-Z]{1}) ", r"'\1 ", line)
        line = re.sub(r" ([a-zA-Z]{1})' ", r" \1'", line)
        line = re.sub(r"(you|they)(' re )", r"\1're ", line)
        line = re.sub(r"(\$\d+\.) (\d+)", r"\1\2", line)
        line = re.sub(r"(\d{1,2}): (\d{1,2}) (am|pm)", r"\1:\2\3", line)
        line = line.replace(" n't ", "n't ")
        line = line.replace(" 've ", "'ve ")

        doc = nlp(line)

        lines = []

        def repr_word(tok):
            txt = tok.text_with_ws
            if tok.text in SPECIAL:
                txt = txt.replace(tok.text, SPECIAL[tok.text])
            elif tok.is_sent_start or tok.ent_type_ in ['PERSON', 'ORG', 'PRODUCT', 'GPE', 'LOC', 'FAC', 'NORP', 'EVENT', 'WORK_OF_ART']:
                txt = txt.capitalize()
            if tok.text.upper() in COMPANIES:
                txt = txt.upper()
            return txt

        for tok in doc:
            lines.append(repr_word(tok))

        line = ''.join(lines)
        if not line.endswith('.') and append:
            line += ' ...'

        return line
        # line = line.replace(" 'm ", "'m ")
        # line = line.replace("' m ", "'m ")

    def cleanup_samples(self, samples):
        return [self.cleanup_sample(sample) for sample in samples]

    def cleanup_sample(self, sample):
        sample['source'] = self.cleanup(sample['source'], append=True)
        sample['summaries'] = { model: self.cleanup(line) for model, line in sample['summaries'].items()}
        return sample
       
    def load_corpus(self, text_path):
        with open(text_path,'r') as f:
            return f.read().splitlines()
        
    def sample(self, n=15, clean=True):
        indices = random.sample(range(len(self.source)), n)
        samples = []
        for idx in indices:
            sample = {}
            sample['index'] = idx
            sample['source'] = self.source[idx]
            sample['summaries'] = {}
            sample['summaries']['gold'] = self.gold[idx]
            for key, docs in self.results.items():
                sample['summaries'][key] = docs[idx]
            
            if clean:
                sample = self.cleanup_sample(sample)
            samples.append(sample)
            
        
        return samples
        
    def load_experiment(self):
        self.results = {}
        
        logging.info('Loading source articles')
        
        self.source = self.load_corpus(os.path.join(self.exp_path, 'src.txt'))
        self.gold = None
        
        for d in os.listdir(self.exp_path):
            res_path = os.path.join(self.exp_path, d)
            if os.path.isdir(res_path):
                if self.gold is None:
                    logging.info('Loading gold summaries')
                    self.gold = self.load_corpus(os.path.join(res_path, 'tar.txt'))
                    
                    assert len(self.gold) == len(self.source)

                logging.info('Loading {}'.format(res_path))
                corpus = self.load_corpus(os.path.join(res_path, 'hyp.txt'))
                assert len(corpus) == len(self.source)
                self.results[os.path.basename(res_path)] = corpus
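
A hedged usage sketch of SummaryPicker. The experiment path is a placeholder; it assumes the directory layout read by load_experiment (src.txt at the top level, tar.txt and hyp.txt inside each system subfolder) and the truecase model 'sm.cnndm.tc.model' in the working directory:

# Placeholder experiment folder; see load_experiment above for the expected layout.
picker = SummaryPicker('experiments/cnndm_run1')

for sample in picker.sample(n=3, clean=True):
    print('index:', sample['index'])
    print('source:', sample['source'][:200])
    for system, summary in sample['summaries'].items():
        print(system, '->', summary)
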
コード例 #16
0
combi['tidy_tweet'] = combi['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
tokenized_tweet.head()

#from mosestokenizer import MosesTokenizer, MosesDetokenizer
#from nltk.tokenize.moses import MosesDetokenizer
from sacremoses import MosesTokenizer, MosesDetokenizer

detokenizer = MosesDetokenizer()

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = detokenizer.detokenize(tokenized_tweet[i], return_str=True)

combi['tidy_tweet'] = tokenized_tweet

all_words = ' '.join([text for text in combi['tidy_tweet']])
from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
コード例 #17
0
    def __init__(self, exp_path):

        self.exp_path = exp_path
        self.load_experiment()
        self.detokenizer = MosesDetokenizer(lang='en')
        self.truecaser = MosesTruecaser(load_from='sm.cnndm.tc.model')
コード例 #18
0
import requests
import subprocess
import json
import os
import tempfile
from sacremoses import MosesTokenizer, MosesDetokenizer
from collections import defaultdict
from nltk import sent_tokenize

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# PROCESSING TEXT
tokenizer_en = MosesTokenizer(lang='en')
detokenizer_en = MosesDetokenizer(lang='en')
tokenizer_es = MosesTokenizer(lang='es')
detokenizer_es = MosesDetokenizer(lang='es')

MAX_NUM_TOKENS = 10
SPLIT_DELIMITER = ';'
LANGUAGE_ISO_MAP = {'en': 'english', 'es': 'spanish'}


def tokenize(text, lang, return_str=True):
    if lang == 'en':
        text_tok = tokenizer_en.tokenize(text,
                                         return_str=return_str,
                                         escape=False)
        return text_tok
    elif lang == 'es':
        text_tok = tokenizer_es.tokenize(text,
                                         return_str=return_str,
コード例 #19
0
ファイル: wsc_utils.py プロジェクト: yf1291/nlp3
def get_detokenizer():
    from sacremoses import MosesDetokenizer
    detok = MosesDetokenizer(lang='en')
    return detok
コード例 #20
0
    def __init__(self):
        super(RunHP, self).__init__()

        #   GENERAL  #
        self.seed = 42
        self.cuda_device_id = 6
        self.device = 'cuda'  # 'cuda' or 'cpu'
        self.training_logging_step = 50  # how often to print internal metrics
        self.epochs = 10  # if set to 0 will immediately just to evaluation
        self.learning_rate = 0.0005
        self.grads_clip = 0.25

        # GENERAL DATA RELATED #
        self.dataset = 'amazon'
        self.train_max_groups_per_batch = 6
        self.val_max_groups_per_batch = 13
        self.eval_max_groups_per_batch = 20
        self.max_rev_per_group = 8

        #   DATA SOURCES  #
        # `early_term` limits the number of chunks per epoch
        self.train_early_term = None
        self.val_early_term = None
        self.gener_early_term = 2

        #  GENERAL PATHS   #
        self.root_path = 'copycat'
        self.experiments_folder = 'first_run'
        self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}'
        self.checkpoint_full_fn = 'checkpoint.tar'
        epc = ExperimentsPathController()
        self.output_path = epc(self.output_dir)
        self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
        self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

        #   DATA PATHS  #
        self.base_data_path = f'data/{self.dataset}/'
        self.train_fp = comb_paths(self.base_data_path, "split/train/")
        self.val_fp = comb_paths(self.base_data_path, 'split/val/')
        self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
        self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
        self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

        #   ANNEALING   #
        self.c_m = 8.
        self.c_r = 0.8
        self.c_kl_ann_max_val = 1.
        self.c_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000
        self.z_m = 8.
        self.z_c = 0.8
        self.z_kl_ann_max_val = 1.
        self.z_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000

        #   DECODING/GENERATION  #
        self.beam_size = 5
        self.beam_len_norm = True
        self.beam_excl_words = []
        self.block_ngram_repeat = 3  # or None
        self.ngram_mirror_window = 3  # or None
        self.mirror_conjs = ["and", 'or', ',', 'but']  # or None
        self.block_consecutive = True
        self.min_gen_seq_len = 20

        #   POST-PROCESSING AND ANALYTICS #
        mt = MosesTokenizer()
        self.tok_func = partial(mt.tokenize, escape=False)
        self.sent_split_func = nltk.sent_tokenize
        dt = MosesDetokenizer()
        self.detok_func = partial(dt.detokenize, unescape=False)
        true_caser = MosesTruecaser(load_from=self.tcaser_model_path,
                                    is_asr=True)
        self.true_case_func = partial(true_caser.truecase,
                                      return_str=True,
                                      use_known=True)
        self.analytics_func = partial(ngram_seq_analysis,
                                      tokenizer=self.tok_func,
                                      sent_splitter=self.sent_split_func,
                                      n_grams_to_comp=(2, 3, 4))
コード例 #21
0
    def __init__(self):
        super().__init__()
        self._tokenizer = SacreMosesTokenizer()
        self._detokenizer = MosesDetokenizer()
コード例 #22
0
  def load_model(self, src_language, trg_language, domain, bpe_src_code=None, tokenize=None):
    """ Load model for given trg language. """
    # model_dir = "{}-{}".format(self._model_dir_prefix, trg_language)
    model_dir = f"{self._model_dir_prefix}{src_language}-{trg_language}-{domain}"

    # Load the checkpoint.
    ckpt_path = os.path.join(model_dir, 'model.ckpt')
        
    # Load the vocabularies.
    src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')

    trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
    
    # Load the config.
    config_path = os.path.join(model_dir, 'config_orig.yaml')

    # Adjust config.
    config = load_config(config_path)
    new_config_file = os.path.join(model_dir, 'config.yaml')
    config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                 model_dir, ckpt_path)
    with open(new_config_file, 'w') as cfile:
      yaml.dump(config, cfile)

    # print('Loaded model for {}-{}.'.format(self._src_language, trg_language))
    print('Loaded model for {}-{}.'.format(src_language, trg_language))

    conf = {}

    logger = logging.getLogger(__name__)
    conf["logger"] = logger

    # load the Joey configuration
    cfg = load_config(new_config_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False) if torch.cuda.is_available() else False

    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    if bpe_src_code is not None and level == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])
    # ipdb.set_trace()
    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
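
Outside this method, the returned conf dict is mostly a bundle of callables. The standalone sketch below rebuilds the preprocess/postprocess chain for the character-level branch shown above, with 'de'→'en' as assumed languages, so the data flow can be inspected without a trained model:

from sacremoses import MosesTokenizer, MosesDetokenizer

# Stand-ins for the lambdas built inside load_model (char-level branch).
src_tokenizer = MosesTokenizer(lang="de")
trg_detokenizer = MosesDetokenizer(lang="en")
tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
detokenizer = lambda x: trg_detokenizer.detokenize(x.split(), return_str=True)
segmenter = lambda x: list(x.strip())   # conf["level"] == "char"

preprocess = [tokenizer, segmenter]
postprocess = [detokenizer]

text = "Das ist ein Test."
for fn in preprocess:
    text = fn(text)
print(text)                                   # list of characters fed to the model
print(postprocess[0]("Das ist ein Test ."))   # detokenized output string
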
コード例 #23
0
# 8) BLEU scores
# 9) F-measure

# start the server using:
# java -Djava.io.tmpdir=tmp/ -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
# -preload tokenize,ssplit,pos,lemma,ner,parse,depparse \
# -status_port 9000 -port 9000 -timeout 15000 &

import os
import csv
from collections import Counter
import random
from nltk.parse import CoreNLPParser
from nltk.stem import PorterStemmer
from sacremoses import MosesTokenizer, MosesDetokenizer
detokenizer = MosesDetokenizer()
mt = MosesTokenizer()
# wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Lexical Parser
parser = CoreNLPParser(url='http://localhost:9000')
import kenlm
import time

# LM link: http://www.keithv.com/software/giga/
VP_2gram_LM = os.path.join("LMs", "lm_giga_64k_vp_2gram",
                           "lm_giga_64k_vp_2gram.arpa")
NVP_2gram_LM = os.path.join("LMs", "lm_giga_64k_nvp_2gram",
                            "lm_giga_64k_nvp_2gram.arpa")
VP_3gram_LM = os.path.join("LMs", "lm_giga_64k_vp_3gram",
コード例 #24
0
ファイル: transforms.py プロジェクト: zhengtong0807/gluon-nlp
    def __init__(self, return_str=True):
        self._return_str = return_str
        from sacremoses import MosesDetokenizer  # pylint: disable=import-outside-toplevel
        self._detokenizer = MosesDetokenizer()
コード例 #25
0
ファイル: tokenization.py プロジェクト: peternara/pororo-nlp
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt" if "unigram"
                               not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize=True if "jpe" not in self.config.n_model else False,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)
コード例 #26
0
ファイル: mtok.py プロジェクト: ffreemt/freemt-utils
"""Do moses tok detok."""
# pylint: disable=invalid-name, unused-import

import sys

try:
    import sacremoses  # noqa: F401
except ModuleNotFoundError:
    import subprocess as sp
    import shlex
    proc = sp.Popen(shlex.split('pip install sacremoses'),
                    stdout=-1,
                    stderr=-1)
    out, err = proc.communicate()
    if err:
        sys.stderr.write('error: %s' % err.decode())
    sys.stdout.write('%s' % out.decode())

from sacremoses import MosesTokenizer, MosesDetokenizer

MTOK = MosesTokenizer().tokenize
MDETOK = MosesDetokenizer().detokenize
mtok = MTOK
mdetok = MDETOK
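
A round-trip usage sketch of the two module-level callables defined above:

tokens = mtok("Hello, world! It's a test.")
print(tokens)        # Moses tokens, with entities escaped (e.g. "&apos;s" for "'s")
text = mdetok(tokens)
print(text)          # back to: Hello, world! It's a test.
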
コード例 #27
0
class NPEndPatternExtractor(SkillExtractor):
    """Identify noun phrases with certain ending words (e.g 'skills', 'abilities') as skills

    Args:
        endings (list): Single words that should identify the ending of a noun phrase
            as being a skill
        stop_phrases (list): Noun phrases that should not be considered skills
        only_bulleted_lines (bool, default True): Whether or not to only consider lines
            that look like they are items in a list
    """
    def __init__(self,
                 endings,
                 stop_phrases,
                 only_bulleted_lines=True,
                 *args,
                 **kwargs):
        self.endings = endings
        self.stop_phrases = stop_phrases
        self.only_bulleted_lines = only_bulleted_lines
        self.detokenizer = MosesDetokenizer()

    def document_skill_counts(self, document):
        """Count skills in the document

        Args:
            document (string) A document for searching, such as a job posting

        Returns: (collections.Counter) skill occurrences in the document
        """
        skill_counts = Counter()
        for cleaned_phrase, _ in self.noun_phrases_matching_endings(document):
            skill_counts[cleaned_phrase] += 1
        return skill_counts

    def candidate_skills(self, job_posting):
        """Generate candidate skills from the job posting

        Args:
            job_posting (job_postings.JobPosting) A single job posting

        Yields: all candidate skills (algorithms.skill_extractors.base.CandidateSkill)
            found in the job posting
        """
        document = job_posting.get("description")
        for cleaned_phrase, context in self.noun_phrases_matching_endings(
                document):
            orig_context = self.detokenizer.detokenize([t[0] for t in context],
                                                       return_str=True)
            logging.info('Yielding candidate skill %s in context %s',
                         cleaned_phrase, orig_context)
            yield CandidateSkill(skill_name=cleaned_phrase,
                                 matched_skill=cleaned_phrase,
                                 confidence=95,
                                 context=orig_context)

    def noun_phrases_matching_endings(self, document):
        """From the given document, generate noun phrases ending with one of the configured terms

        Args:
            document (string) A raw text document, such as a job posting

        Yields:
            tuples, each with two strings:
                - a noun phrase
                - the context of the noun phrase (currently defined as the surrounding sentence)
        """
        document = str(document)
        lines = document.split('\n')
        for line in lines:
            if not self.only_bulleted_lines or is_bulleted(line):
                for noun_phrase, context in noun_phrases_in_line_with_context(
                        line):
                    term_list = noun_phrase.split()
                    if term_list[-1].lower() in self.endings:
                        cleaned_phrase = clean_beginning(noun_phrase).lower()
                        if cleaned_phrase not in self.stop_phrases:
                            yield cleaned_phrase, context
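
A hedged construction sketch for the extractor above; the endings and stop phrases are illustrative values rather than the project's configured lists, and it assumes the surrounding module's helpers (noun_phrases_in_line_with_context, is_bulleted, clean_beginning, CandidateSkill) are importable:

extractor = NPEndPatternExtractor(
    endings=['skills', 'abilities', 'competencies'],  # noun-phrase endings that mark a skill
    stop_phrases=['communication skills'],            # phrases to ignore even if they match
    only_bulleted_lines=False,                        # also scan lines that are not bullet items
)

document = "Required: strong analytical skills\n- excellent organizational abilities"
print(extractor.document_skill_counts(document))
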
コード例 #28
0
def score_output(args, fname):
    sp = spm.SentencePieceProcessor()
    sp.Load('data_and_models/sim/sim.sp.30k.model')

    detok = MosesDetokenizer('en')
    tok = TreebankWordTokenizer()

    f = open(fname, 'r')
    lines = f.readlines()

    pairs = []
    pairs_bleu = []
    src = None

    for i in lines:
        if i[0] == "T":
            target = i.split()[1:]
            target = " ".join(target).replace("@@ ", "")
            target_bleu = target
            target_sim = make_example(target, detok, tok, sp)
        elif i[0] == "H":
            hyp = i.split()[2:]
            hyp = " ".join(hyp).replace("@@ ", "")
            hyp_bleu = hyp
            hyp_sim = make_example(hyp, detok, tok, sp)
        elif i[0] == "S":
            if src is not None:
                pairs.append((target_sim, hyp_sim, src_sim))
                pairs_bleu.append((target_bleu, hyp_bleu, src_bleu))
            src = i.split()[1:]
            src = " ".join(src).replace("@@ ", "")
            src_bleu = src
            src_sim = make_example(src, detok, tok, sp)

    pairs.append((target_sim, hyp_sim, src_sim))
    pairs_bleu.append((target_bleu, hyp_bleu, src_bleu))

    model = torch.load(args.sim_model_file, map_location='cpu')

    state_dict = model['state_dict']
    vocab_words = model['vocab_words']
    sim_args = model['args']
    model = WordAveraging(sim_args, vocab_words)
    model.load_state_dict(state_dict, strict=True)

    scores = []
    scores_simile = []
    for i in pairs:
        wp1 = Example(i[0])
        wp1.populate_embeddings(model.vocab)
        wp2 = Example(i[1])
        wp2.populate_embeddings(model.vocab)
        wx1, wl1, wm1 = model.torchify_batch([wp1])
        wx2, wl2, wm2 = model.torchify_batch([wp2])
        score = model.scoring_function(wx1, wm1, wl1, wx2, wm2, wl2)
        ref_l = len(i[0])
        hyp_l = len(i[1])
        lp = np.exp(1 - max(ref_l, hyp_l) / float(min(ref_l, hyp_l)))
        simile = lp**args.length_penalty * score.data[0]
        scores_simile.append(simile)
        scores.append(score.data[0])

    print("SIM: {0}".format(np.mean(scores)))
    print("SimiLe: {0}".format(np.mean(scores_simile)))

    fout = open(fname + ".target.out", "w")
    for i in pairs_bleu:
        fout.write(i[0].strip() + "\n")
    fout.close()

    fout = open(fname + ".hyp.out", "w")
    for i in pairs_bleu:
        fout.write(i[1].strip() + "\n")
    fout.close()

    fout = open(fname + ".src.out", "w")
    for i in pairs_bleu:
        fout.write(i[2].strip() + "\n")
    fout.close()

    cmd = "perl multi-bleu.perl {0} < {1}".format(fname + ".target.out",
                                                  fname + ".hyp.out")
    os.system(cmd)
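
The SimiLe value combines the SIM model score with the length penalty lp = exp(1 - max(ref_len, hyp_len) / min(ref_len, hyp_len)) raised to args.length_penalty. A small standalone check of that arithmetic with toy numbers (the 0.25 exponent is only an example value):

import numpy as np


def simile(sim_score, ref_len, hyp_len, length_penalty=0.25):
    # length penalty from the snippet above: exp(1 - max/min)
    lp = np.exp(1 - max(ref_len, hyp_len) / float(min(ref_len, hyp_len)))
    return lp ** length_penalty * sim_score


print(simile(0.80, ref_len=20, hyp_len=20))  # lp == 1.0, so SimiLe == SIM == 0.80
print(simile(0.80, ref_len=20, hyp_len=25))  # length mismatch shrinks the score slightly
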
コード例 #29
0
    },
    JOINT_TUNING: {
        'train': gold_train_data_source,
        'val': gold_val_data_source
    }
}

#   TRUECASER   #

tcaser = MosesTruecaser(load_from=run_conf.tcaser_model_path, is_asr=True)
tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)

#   WORD TOKENIZERS / DE-TOKENIZERS   #

mt = MosesTokenizer()
dt = MosesDetokenizer()

#   SUB-WORD TOKENIZER   #

bpe = BPE(glossaries=SPECIAL_TOKENS)
bpe.load(bpcodes_fp=run_conf.bpe_fp)

unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())
gold_tok_func = lambda x: bpe.tokenize(mt.tokenize(tcase_func(x), escape=False))
detok_func = lambda x: dt.detokenize(bpe.detokenize(x), unescape=False)

#   DATA PIPELINES AND VOCAB   #

vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                         lowercase=False,
コード例 #30
0
# Corpus BLEU with arguments
# Run this file from CMD/Terminal
# Example Command: python3 compute-bleu-args.py test_file_name.txt mt_file_name.txt

import sys
import sacrebleu

# Only if you originally used MosesTokenizer
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='en')

target_test = sys.argv[1]  # Test file argument
target_pred = sys.argv[2]  # MTed file argument

# Open the test dataset human translation file and detokenize the references
refs = []

with open(target_test) as test:
    for line in test:
        line = line.strip().split()
        line = md.detokenize(line)
        refs.append(line)

print("Reference 1st sentence:", refs[0])

refs = [refs]  # Yes, it is a list of list(s) as required by sacreBLEU

# Open the translation file by the NMT model and detokenize the predictions
preds = []

with open(target_pred) as pred: