Example #1
def process_questions(questions, return_score_modifiers=False):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(
            apply_bpe(tokenize(question)) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers)
        answers = normalize_new_lines(answers)
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        elif return_score_modifiers:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score['score'],
                'best_index': best_index,
                'best_score': best_score,
                'score_modifiers': answers_score['score_modifiers']
            })
        else:
            prepared_answers_list.append({
                'answers': answers,
                'scores': answers_score['score'],
                'best_index': best_index,
                'best_score': best_score
            })

    return prepared_answers_list
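
A minimal usage sketch of process_questions(), assuming the project-internal helpers (tokenize, apply_bpe, inference_helper, detokenize, score_answers, ...) are importable; the question text is made up:

# Hypothetical call: a single string is wrapped into a list by the function itself
results = process_questions("How are you today?", return_score_modifiers=True)
# Each result is None (for an empty question) or a dict with the keys
# 'answers', 'scores', 'best_index', 'best_score' and 'score_modifiers'
print(results[0]['best_index'], results[0]['best_score'])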
Example #2

def process(input_folder, type, output_folder):
    # Open the combined output file once; the with-blocks ensure all handles are closed
    with open(os.path.join(output_folder, type + ".json"),
              mode="w", encoding="utf-8") as updated_json:
        for filename in os.listdir(input_folder):
            if type in filename:
                print("filename", filename)
                with open(os.path.join(input_folder, filename),
                          mode="r", encoding="utf-8") as json_file:
                    data = json.load(json_file)
                upd_trdata = []
                for entry_index, entry in enumerate(data):
                    # Detokenize the summary, then re-tokenize it into space-separated words
                    summary = entry['summary']
                    summary = detokenize(summary)
                    summary = " ".join(word_tokenize(summary))
                    upd_entry = entry
                    upd_entry['summary'] = summary
                    upd_trdata.append(upd_entry)
                    if entry_index % 50 == 0:
                        print(entry_index)
                json.dump(upd_trdata, updated_json)
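
A hedged call example for process(); the folder names and the "train" split marker are hypothetical. Every file in input_folder whose name contains the marker is re-tokenized and written to <output_folder>/train.json:

process("data/json/", "train", "data/updated/")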
Example #3
def detokenize(corpus: iCorpus,
               lang: Lang,
               tokenizer: str = "",
               model: str = "",
               progress: bool = True) -> iCorpus:
    if progress:
        corpus = tqdm(corpus)
    if lang == 'en':
        if tokenizer is None or tokenizer == "" or tokenizer == 'moses':
            return (_lazy_load_moses_detokenizer('en').detokenize(
                line.split(' '), return_str=True, unescape=False)
                    for line in corpus)
        elif tokenizer == 'bpe':
            # The bpe tokenizer will not remove \n, but the others will. Make BPE remove \n
            return (_lazy_load_bpe_tokenizer('en', model=model).DecodePieces(
                line.split(' ')).replace('▁', ' ').replace('\n', '')
                    for line in corpus)
        else:
            raise ValueError(f'Unknown tokenizer={tokenizer}')
    elif lang == 'is':
        if tokenizer is None or tokenizer == "":
            return (mideind_tok.detokenize(list(
                mideind_tok.tokenize(line, normalize=False)),
                                           normalize=False) for line in corpus)
        elif tokenizer == 'bpe':
            # The bpe tokenizer will not remove \n, but the others will. Make BPE remove \n
            return (_lazy_load_bpe_tokenizer('is', model=model).DecodePieces(
                line.split(' ')).replace('▁', ' ').replace('\n', '')
                    for line in corpus)
        elif tokenizer == 'moses':
            return (_lazy_load_moses_detokenizer('is').detokenize(
                line.split(' '), return_str=True, unescape=False)
                    for line in corpus)
        else:
            raise ValueError(f'Unknown tokenizer={tokenizer}')
    else:
        raise ValueError(f'Unknown language={lang}')
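
A minimal sketch of consuming the lazy detokenize() generator above, using the Moses branch; the input lines are invented:

tokenized_lines = ["Hello , world !", "This is a test ."]
for line in detokenize(tokenized_lines, lang='en', tokenizer='moses', progress=False):
    print(line)  # e.g. "Hello, world!"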
Example #4
def create_json(input_folder, input_summaries, output_folder):
    for filename in os.listdir(input_folder):
        d = None
        with codecs.open(input_folder + filename) as json_data:
            d = json.load(json_data)
        print('filename', input_folder + filename)
        output = []
        for entry in d:
            datetime_object = datetime.strptime(entry['day'], '%m_%d_%y')
            html_file_name = []
            html_file_name.append(datetime_object.strftime("%Y%m%d"))
            visname_homename = entry['vis_name'].replace(" ", "_") + "-" + entry['home_name'].replace(" ", "_")
            visname_homename = visname_homename.replace('D-backs', 'Diamondbacks')
            html_file_name.append(visname_homename)
            html_file_name.append(str(entry['vis_line']['team_runs']) + "-" + str(entry['home_line']['team_runs']))

            files = glob.glob(input_summaries+"*" +"_".join(html_file_name))
            if len(files) < 1:
                print(input_summaries+"*"+"_".join(html_file_name) + " not found")
            elif len(files) > 1:
                print(input_summaries + "*" + "_".join(html_file_name) + " multiple found")
            else:
                fname = files[0]
                with codecs.open(fname, encoding='utf-8') as f:
                    content = f.readlines()
                updated_content = []
                for line in content:
                    words = word_tokenize(detokenize(line.strip().split()))
                    updated_content.append(" ".join(words))
                text = " *NEWPARAGRAPH* ".join(updated_content)
                entry['summary'] = text.split()
                output.append(entry)

        if len(output) > 0:
            with codecs.open(output_folder + 'combined_' + filename, 'w+') as outfile:
                json.dump(output, outfile)
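
A hedged call example; all paths are hypothetical and need a trailing separator, since the function concatenates them directly with file names:

create_json(input_folder="data/json/",
            input_summaries="data/summaries/",
            output_folder="data/combined/")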
Example #5
def check_grammar(**options: Any) -> str:
    """Do a full spelling and grammar check of the source text"""

    accumul: List[str] = []
    offset = 0
    inneroptions: Dict[str, Union[str, bool]] = {}
    inneroptions["annotate_unparsed_sentences"] = options.get(
        "annotate_unparsed_sentences", True
    )
    inneroptions["ignore_rules"] = options.get("ignore_rules", set())
    annlist: List[str] = []
    format = options.get("format", "json")
    for toklist in sentence_stream(**options):
        len_tokens = len(toklist)
        # Invoke the spelling and grammar checker on the token list
        # Only contains options relevant to the grammar check
        sent = check_tokens(toklist, **inneroptions)
        if sent is None:
            # Should not happen?
            continue

        tokens: List[AnnTokenDict]
        if sent.tree is None:
            # Not parsed: use the raw token list
            tokens = [
                AnnTokenDict(k=d.kind, x=d.txt, o=d.original or d.txt)
                for d in sent.tokens
            ]
        else:
            # Successfully parsed: use the text from the terminals (where available)
            # since we have more info there, for instance on em/en dashes.
            # Create a map of token indices to corresponding terminal text
            assert sent.terminals is not None
            token_map = {t.index: t.text for t in sent.terminals}
            tokens = [
                AnnTokenDict(
                    k=d.kind, x=token_map.get(ix, d.txt), o=d.original or d.txt
                )
                for ix, d in enumerate(sent.tokens)
            ]
        # Maintain token character offsets, accumulated over the entire source text
        token_offsets: Dict[int, int] = dict()
        for ix, t in enumerate(toklist):
            token_offsets[ix] = offset
            offset += len(t.original or t.txt or "")

        # Create a normalized form of the sentence
        cleaned = detokenize(toklist, normalize=True)
        # Extract the annotation list (defensive programming here)
        a: List[Annotation] = getattr(sent, "annotations", cast(List[Annotation], []))
        # Sort in ascending order by token start index, and then by end index
        # (more narrow/specific annotations before broader ones)
        a.sort(key=lambda ann: (ann.start, ann.end))

        if format == "text" or format == "textplustoks":
            arev = sorted(a, key=lambda ann: (ann.start, ann.end), reverse=True)
            cleantoklist: List[CorrectToken] = toklist[:]
            for xann in arev:
                if xann.suggest is None:
                    # Nothing to correct with, nothing we can do
                    continue
                cleantoklist[xann.start + 1].txt = xann.suggest
                if xann.end > xann.start:
                    # Annotation spans many tokens
                    # "Okkur börnunum langar í fisk"
                    # "Leita að kílómeter af féinu" → leita að kílómetri af fénu → leita að kílómetra af fénu
                    # "dást af þeim" → "dást að þeim"
                    # Single-token annotations for this span have already been handled
                    # Only case is one ann, many toks in toklist
                    # Give the first token the correct value
                    # Delete the other tokens
                    del cleantoklist[xann.start + 2 : xann.end + 2]
            txt = detokenize(cleantoklist, normalize=True)
            if options.get("annotations", False):
                for aann in a:
                    annlist.append(str(aann))
                if annlist and not options.get("print_all", False):
                    txt = txt + "\n" + "\n".join(annlist)
                    annlist = []
            accumul.append(txt)

        elif format == "json":
            # Create final dictionary for JSON encoding
            # Convert the annotations to a standard format before encoding in JSON
            annotations: List[AnnDict] = [
                AnnDict(
                    # Start token index of this annotation
                    start=ann.start,
                    # End token index (inclusive)
                    end=ann.end,
                    # Character offset of the start of the annotation in the original text
                    start_char=token_offsets[ann.start],
                    # Character offset of the end of the annotation in the original text
                    # (inclusive, i.e. the offset of the last character)
                    end_char=(
                        token_offsets[ann.end + 1]
                        if ann.end + 1 < len_tokens
                        else offset
                    )
                    - 1,
                    code=ann.code,
                    text=ann.text,
                    detail=ann.detail or "",
                    suggest=ann.suggest or "",
                )
                for ann in a
            ]
            ard = AnnResultDict(
                original=cleaned,
                corrected=sent.tidy_text,
                tokens=tokens,
                annotations=annotations,
            )

            accumul.append(json_dumps(ard))
        elif format == "csv":
            for cann in a:
                accumul.append(
                    "{},{},{},{},{},{}".format(
                        cann.code,
                        cann.original,
                        cann.suggest,
                        cann.start,
                        cann.end,
                        cann.suggestlist,
                    )
                )
        elif format == "m2":
            accumul.append("S {0}".format(cleaned))
            for mann in a:
                accumul.append(
                    "A {0} {1}|||{2}|||{3}|||REQUIRED|||-NONE-|||0".format(
                        mann.start, mann.end, mann.code, mann.suggest
                    )
                )
            accumul.append("")
    if options.get("print_all", True):
        accumstr = " ".join(accumul)
        if annlist:
            # We want the annotations at the bottom
            accumstr = accumstr + "\n" + "\n".join(annlist)
    else:
        accumstr = "\n".join(accumul)
    return accumstr
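
A usage sketch under an explicit assumption: the source text is assumed to reach sentence_stream() via an "input" keyword option, which is a guess and not confirmed by the code above; "format", "annotations" and "print_all" do appear in the function body:

# "input" is an assumed option name; the sample sentence is made up
result = check_grammar(input="Þesi setning er rangt skrifuð.", format="json")
print(result)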
Example #6
def test_grammar(**options: Any) -> Tuple[str, TokenSumType]:
    """Do a full spelling and grammar check of the source text"""

    accumul: List[str] = []
    offset = 0
    alltoks: TokenSumType = []
    inneroptions: Dict[str, Union[str, bool]] = {}
    inneroptions["annotate_unparsed_sentences"] = options.get(
        "annotate_unparsed_sentences", True
    )
    inneroptions["ignore_rules"] = options.get("ignore_rules", set())
    annlist: List[str] = []
    for toklist in sentence_stream(**options):
        # Invoke the spelling and grammar checker on the token list
        # Only contains options relevant to the grammar check
        sent = check_tokens(toklist, **inneroptions)
        if sent is None:
            # Should not happen?
            continue

        # Maintain token character offsets, accumulated over the entire source text
        token_offsets: Dict[int, int] = dict()
        for ix, t in enumerate(toklist):
            token_offsets[ix] = offset
            offset += len(t.original or t.txt or "")

        # Extract the annotation list (defensive programming here)
        a: List[Annotation] = getattr(sent, "annotations", cast(List[Annotation], []))
        # Sort in ascending order by token start index, and then by end index
        # (more narrow/specific annotations before broader ones)
        a.sort(key=lambda ann: (ann.start, ann.end))

        arev = sorted(a, key=lambda ann: (ann.start, ann.end), reverse=True)
        cleantoklist: List[CorrectToken] = toklist[:]
        alltoks.extend(cleantoklist)
        for xann in arev:
            if xann.suggest is None:
                # Nothing to correct with, nothing we can do
                continue
            cleantoklist[xann.start + 1].txt = xann.suggest
            if xann.end > xann.start:
                # Annotation spans many tokens
                # "Okkur börnunum langar í fisk"
                # "Leita að kílómeter af féinu" → leita að kílómetri af fénu → leita að kílómetra af fénu
                # "dást af þeim" → "dást að þeim"
                # Single-token annotations for this span have already been handled
                # Only case is one ann, many toks in toklist
                # Give the first token the correct value
                # Delete the other tokens
                del cleantoklist[xann.start + 2 : xann.end + 2]
        txt = detokenize(cleantoklist, normalize=True)
        if options.get("annotations", False):
            for aann in a:
                annlist.append(str(aann))
            if annlist and not options.get("print_all", False):
                txt = txt + "\n" + "\n".join(annlist)
                annlist = []
        accumul.append(txt)

    accumstr = "\n".join(accumul)

    return accumstr, alltoks
Example #7
def should_be(s1: str, s2: str) -> None:
    toklist = t.tokenize(s1, **options)
    assert s2 == t.detokenize(toklist, **options)
Example #8
def should_be_equal(s: str) -> None:
    toklist = t.tokenize(s, **options)
    assert s == t.detokenize(toklist, **options)
Example #9
def should_be(s1, s2):
    toklist = t.tokenize(s1, **options)
    assert s2 == t.detokenize(toklist, **options)
Example #10
def correct_spaces(tokens: Iterable[Tuple[str, str]]) -> str:
    """ Returns a string with a reasonably correct concatenation
        of the tokens, where each token is a (tag, text) tuple. """
    return detokenize(
        Tok(TOK.PUNCTUATION if tag == "c" else TOK.WORD, txt, None)
        for tag, txt in tokens)
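
A small sketch of the wrapper above; the (tag, text) pairs are invented, with "c" marking punctuation exactly as the function expects:

pairs = [("w", "Halló"), ("c", ","), ("w", "heimur"), ("c", "!")]
print(correct_spaces(pairs))  # expected roughly: Halló, heimur!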
Example #11
def test_correction():
    SENT = [
        (
            """Hann sagði: "Þú ert fífl"! Ég mótmælti því.""",
            """Hann sagði: „Þú ert fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert "fífl"! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert «fífl»! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
        ("""Hann sagði: Þú ert ´fífl´! Hringdu í 7771234.""",
         """Hann sagði: Þú ert ‚fífl‘! Hringdu í 7771234."""),
        (
            """Hann sagði: Þú ert (´fífl´)! Ég mótmælti því.""",
            """Hann sagði: Þú ert (‘ fífl‘)! Ég mótmælti því.""",  # !!!
        ),
        ("""Hann "gaf" mér 10,780.65 dollara.""",
         """Hann „gaf“ mér 10,780.65 dollara."""),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10,780.65.""",
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_MODIFY = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_TRANSLATE = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
    ]
    SENT_CONVERT_NUMBERS = [
        ("""Hann "gaf" mér 10,780.65 dollara.""",
         """Hann „gaf“ mér 10.780,65 dollara."""),
        ("""Hann "gaf" mér €10,780.65.""", """Hann „gaf“ mér €10.780,65."""),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    for sent, correct in SENT:
        s = t.tokenize(sent)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE:
        s = t.tokenize(sent,
                       handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct
    for sent, correct in SENT_CONVERT_NUMBERS:
        s = t.tokenize(sent, convert_numbers=True)
        txt = t.detokenize(s, normalize=True)
        assert txt == correct