def request_segmented(cls, sent_map, src_lang=None, tgt_lang=None, verbatim=False):
    """Translate presegmented sentences

    Args:
        sent_map: either a list of sentences or a dict[key] of sentences"""
    data = dict(src_lang=src_lang, tgt_lang=tgt_lang)
    if isinstance(sent_map, dict):
        sents = (
            [tokenizer.correct_spaces(sent) for sent in sent_map.values()]
            if not verbatim
            else list(sent_map.values())
        )
        result = TranslateClient._request(sents, data=data)
        inst_map = {idx: inst for (idx, inst) in zip(sent_map.keys(), result)}
        resp = dict(results=inst_map)
    else:
        sents = (
            [tokenizer.correct_spaces(sent) for sent in sent_map]
            if not verbatim
            else sent_map
        )
        result = TranslateClient._request(sents, data=data)
        inst_map = {idx: inst for (idx, inst) in enumerate(result)}
        resp = dict(results=inst_map)
    return resp
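# Usage sketch for request_segmented above (illustrative, not from the source):
# assumes it is exposed as a classmethod on TranslateClient and that a
# translation backend is reachable. The keys "s1"/"s2" are hypothetical.
sent_map = {
    "s1": "Þetta er fyrsta setningin .",
    "s2": "Þetta er önnur setningin .",
}
resp = TranslateClient.request_segmented(sent_map, src_lang="is", tgt_lang="en")
# resp["results"] maps the original keys back to the translated sentences
for key, translation in resp["results"].items():
    print(key, translation)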
def test_correct_spaces():
    s = t.correct_spaces(
        "Frétt \n dagsins:Jón\t ,Friðgeir og Páll ! 100,8 / 2 = 50.4"
    )
    assert s == "Frétt dagsins: Jón, Friðgeir og Páll! 100,8/2 = 50.4"
    s = t.correct_spaces(
        "Hitinn var\n-7,4 \t gráður en álverðið var \n $10,348.55."
    )
    assert s == "Hitinn var -7,4 gráður en álverðið var $10,348.55."
    s = t.correct_spaces(
        "\n Breytingin var +4,10 þingmenn \t en dollarinn er nú á €1,3455 ."
    )
    assert s == "Breytingin var +4,10 þingmenn en dollarinn er nú á €1,3455."
    s = t.correct_spaces("Jón- sem var formaður — mótmælti málinu.")
    assert s == "Jón-sem var formaður—mótmælti málinu."
    s = t.correct_spaces("Það á að geyma mjólkina við 20 ± 3 °C")
    assert s == "Það á að geyma mjólkina við 20±3° C"
def index_text(text, correct_spaces: bool = False):
    """Segments contiguous (Icelandic) text into paragraphs and sentences
    and returns:
        dictionary of sentence indices to sentences
        dictionary of paragraph index to constituent sentence indices"""
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)

    pg_idx_to_sent_idx = dict()
    sent_idx_to_sent = dict()
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))
            curr_sent = tokenizer.normalized_text_from_tokens(curr_sent)
            if correct_spaces:
                curr_sent = tokenizer.correct_spaces(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent
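# Illustrative call to index_text (assumes its Greynir/tokenizer dependencies
# are imported); the mappings shown in the comments are indicative only.
pg_map, sent_map = index_text(
    "Halló heimur. Þetta er önnur setning.\n\nNý efnisgrein hér.",
    correct_spaces=True,
)
# pg_map   -> {0: [0, 1], 1: [2]}   (paragraph index -> sentence indices)
# sent_map -> {0: "Halló heimur.", 1: "Þetta er önnur setning.", 2: "Ný efnisgrein hér."}
for pg_idx, sent_idxs in pg_map.items():
    for sent_idx in sent_idxs:
        print(pg_idx, sent_idx, sent_map[sent_idx])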
def query():
    """ Respond to a query string """
    q = request.form.get("q", "").strip()[0:_MAX_QUERY_LENGTH]
    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = get_json_bool(request, "autouppercase", True)
    result = dict()
    with SessionContext(commit=True) as session:
        toklist = list(
            tokenize(
                q,
                enclosing_session=session,
                auto_uppercase=q.islower() if auto_uppercase else False,
            )
        )
        actual_q = correct_spaces(" ".join(t.txt or "" for t in toklist))
        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))
        # Try to parse and process as a query
        is_query = process_query(session, toklist, result)
        result["is_query"] = is_query
        result["q"] = actual_q
    return jsonify(result=result)
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    bindb = BIN_Db.get_db()
    with SessionContext(commit=True) as session:
        q = session.query(Person.name, Person.title, Person.article_url, Article.id) \
            .join(Article).join(Root) \
            .filter(Root.visible) \
            .order_by(desc(Article.timestamp))[0:limit * 2]  # Go through up to 2 * N records
        for p in q:
            # Insert the name into the list if it's not already there,
            # or if the new title is longer than the previous one
            if p.name not in toplist or len(p.title) > len(toplist[p.name][0]):
                toplist[p.name] = (
                    correct_spaces(p.title),
                    p.article_url,
                    p.id,
                    bindb.lookup_name_gender(p.name),
                )
                if len(toplist) >= limit:
                    # We now have as many names as we initially wanted: terminate the loop
                    break
    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
def spin_trigram(first):
    """ Spin a random sentence from the trigram frequency table,
        starting from the given candidate first tokens """
    t1 = t2 = ""
    candidates = first
    sent = ""
    while candidates:
        # Pick one candidate at random, weighted by its frequency
        sumfreq = sum(freq for _, freq in candidates)
        r = randint(0, sumfreq - 1)
        for t3, freq in candidates:
            if r < freq:
                if not t3:
                    # End of sentence
                    candidates = []
                    break
                if sent:
                    sent += " " + t3
                else:
                    sent = t3
                t1, t2 = t2, t3
                # session is assumed to be available from the enclosing scope
                q = session.execute(
                    "select t3, frequency from trigrams "
                    "where t1=:t1 and t2=:t2 order by frequency desc",
                    dict(t1=t1, t2=t2),
                )
                candidates = q.fetchall()
                break
            r -= freq
    return correct_spaces(sent)
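# The weighted random selection inside spin_trigram, shown in isolation
# (a sketch; weighted_pick is a hypothetical helper, not part of the source):
from random import randint

def weighted_pick(candidates):
    """Pick a (value, frequency) candidate with probability
    proportional to its frequency, as in spin_trigram above."""
    sumfreq = sum(freq for _, freq in candidates)
    r = randint(0, sumfreq - 1)
    for value, freq in candidates:
        if r < freq:
            return value
        r -= freq

# weighted_pick([("góður", 5), ("slæmur", 1)]) returns "góður" about 5/6 of the time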
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = ArticleInfo(domain=p.domain, uuid=p.id, heading=p.heading, ts=p.timestamp)
        rd[s][ai.uuid] = ai  # Add to a dict of UUIDs
def tidy_text(self):
    """ Return a [more] correctly spaced text representation of the sentence """
    if self.tree is None:
        # Not parsed (yet)
        txt = self.text
    else:
        # Use the terminal text representation - it's got fancy em/en-dashes and stuff
        txt = " ".join(t.text for t in self.terminals)
    return correct_spaces(txt)
def dump(tokens):
    print(
        "\n{1}\n{0} tokens:\n".format(
            len(tokens),
            tokenizer.correct_spaces(" ".join(t.txt for t in tokens if t.txt)),
        )
    )
    for token in tokens:
        err = token.error_description
        if err:
            print("{0}".format(token.txt))
            print(" {0}: {1}".format(token.error_code, err))
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    MAX_TITLE_LENGTH = 64

    with SessionContext(commit=True) as session:
        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0:limit * 2]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is better than the previous one
                if p.name not in toplist or is_better_title(p.title, toplist[p.name][0]):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(
            domain=p.domain,
            uuid=p.id,
            heading=p.heading,
            timestamp=p.timestamp,
            ts=p.timestamp.isoformat()[0:16],
            url=p.url,
        )
        rd[s][p.id] = ai  # Add to a dict of UUIDs
def query_api(version=1):
    """ Respond to a query string """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    q = q.strip()[0:_MAX_QUERY_LENGTH]

    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = get_json_bool(request, "autouppercase", True)
    result = dict()
    ql = q.lower()

    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:
            toklist = list(
                tokenize_and_recognize(
                    q,
                    enclosing_session=session,
                    auto_uppercase=q.islower() if auto_uppercase else False,
                )
            )
            actual_q = correct_spaces(" ".join(t.txt or "" for t in toklist))
            if Settings.DEBUG:
                # Log the query string as seen by the parser
                print("Query is: '{0}'".format(actual_q))
            # Try to parse and process as a query
            is_query = process_query(session, toklist, result)
            result["valid"] = is_query
            result["q"] = actual_q

    return better_jsonify(**result)
def append_names(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd,
        assuming that the key is a person name """
    for p in q:
        s = correct_spaces(prop_func(p))
        ai = dict(
            domain=p.domain,
            uuid=p.id,
            heading=p.heading,
            timestamp=p.timestamp,
            ts=p.timestamp.isoformat()[0:16],
            url=p.url,
        )
        # Obtain the key within rd that should be updated with new
        # data. This may be an existing key, a new key or None if no
        # update is to be performed.
        s = name_key_to_update(rd, s)
        if s is not None:
            rd[s][p.id] = ai  # Add to a dict of UUIDs
def main():
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        exit()

    with SessionContext(commit=False) as session:
        bef = datetime(2020, 7, 26, 0, 0, 1)
        aft = datetime(2020, 7, 27, 0, 0, 1)
        q = (
            session.query(
                Article.url, Article.timestamp, Article.heading, Article.tokens
            )
            .filter(Article.timestamp > bef)
            .filter(Article.timestamp < aft)
            .order_by(Article.timestamp)
        )
        items = list()
        for r in q.all():
            (url, ts, title, tokens) = r
            text = ""
            tokens = json.loads(tokens)
            if not tokens:
                continue
            # Paragraphs
            for p in tokens:
                # Sentences
                for s in p:
                    # Tokens
                    for t in s:
                        text += t["x"] + " "
            d = dict(url=url, timestamp=ts.isoformat(), title=title, text=text)
            d["text"] = correct_spaces(d["text"])
            items.append(d)
        print(json.dumps(items, ensure_ascii=False, sort_keys=True, indent=4))
def query_person_title(session, name):
    """ Return the most likely title for a person """
    rl = _query_person_titles(session, name)
    return correct_spaces(rl[0]["answer"]) if rl else ""
def tidy_text(self):
    """ Return a [more] correctly spaced text representation of the sentence """
    return correct_spaces(self.text)
def gen_to_string(g):
    """ Return a correctly spaced string from a generator of tokens """
    return tokenizer.correct_spaces(" ".join(t.txt for t in g if t.txt))
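# Round-trip sketch for gen_to_string, assuming the PyPI `tokenizer` package:
import tokenizer

toks = tokenizer.tokenize("Páll fór út , og María líka .")
print(gen_to_string(toks))  # expected: "Páll fór út, og María líka."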
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin)
    parser.add_argument("--posfile", help="File with POS tags", required=False)
    parser.add_argument(
        "-w",
        "--word-spelling-error-rate",
        type=float,
        default=0.3,
        help="Error rate used for spelling of words.",
        required=False,
    )
    parser.add_argument(
        "-r",
        "--rule-chance-error-rate",
        help="Chance for each rule to be applied",
        default=0.9,
        type=float,
    )
    parser.add_argument(
        "-p",
        "--parse-online",
        help="Parse sentence with Greynir if pos not provided",
        # NB: argparse's type=bool treats any non-empty string as True
        type=bool,
        default=True,
    )
    parser.add_argument("--seed", default=1, type=int)
    parser.add_argument("-t", "--dont-detokenize", action="store_true")
    parser.add_argument("-n", "--nproc", default=1, type=int)
    parser.add_argument("-b", "--batch-size", default=1, type=int)
    args = parser.parse_args()

    error_generators = [
        DativitisErrorRule,
        MoodErrorRule,
        NounCaseErrorRule,
        SwapErrorRule,
        DuplicateWordsRule,
        SplitWordsRule,
        NoiseErrorRule,
        DeleteSpaceErrorRule,
    ]
    error_dataset = ErrorDataset(args.infile, args.posfile, args, error_generators=error_generators)
    error_loader = torch.utils.data.DataLoader(
        error_dataset,
        num_workers=args.nproc,
        worker_init_fn=worker_init_fn,
        batch_size=args.batch_size,
    )
    for error_batch in error_loader:
        for error_sentence in error_batch:
            if args.dont_detokenize:
                print(error_sentence)
            else:
                print(correct_spaces(error_sentence))
def query_entity_def(session, name):
    """ Return a single (best) definition of an entity """
    rl = _query_entity_definitions(session, name)
    return correct_spaces(rl[0]["answer"]) if rl else ""
def query_entity_def(session, name):
    """ Return a single (best) definition of an entity """
    rl = _query_entity_titles(session, name)
    return correct_spaces(rl[0][0]) if rl else ""
def query_person_title(session, name):
    """ Return the most likely title for a person """
    rl = _query_person_titles(session, name)
    return correct_spaces(rl[0][0]) if rl else ""
def normalize(ex):
    ice, eng = ex["is"], ex["en"]
    ice = tokenizer.correct_spaces(ice)
    eng = NLTK_DETOK.detokenize(eng.split(" "))
    return {"is": ice, "en": eng}
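# Usage sketch for normalize; NLTK_DETOK is assumed to be an NLTK Treebank
# detokenizer, as the detokenize() call suggests.
import tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer

NLTK_DETOK = TreebankWordDetokenizer()

ex = {"is": "Þetta er prófun , ekki satt ?", "en": "This is a test , right ?"}
print(normalize(ex))
# -> {"is": "Þetta er prófun, ekki satt?", "en": "This is a test, right?"}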
def test_correction():
    SENT = [
        (
            """Hann sagði: "Þú ert fífl"! Ég mótmælti því.""",
            """Hann sagði: „Þú ert fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert "fífl"! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: Þú ert «fífl»! Ég mótmælti því.""",
            """Hann sagði: Þú ert „fífl“! Ég mótmælti því.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 7771234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 7771234.""",
        ),
        (
            """Hann sagði: Þú ert (´fífl´)! Ég mótmælti því.""",
            """Hann sagði: Þú ert (´fífl‘)! Ég mótmælti því.""",  # !!!
        ),
        (
            """Hann "gaf" mér 10,780.65 dollara.""",
            """Hann „gaf“ mér 10,780.65 dollara.""",
        ),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10,780.65.""",
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_MODIFY = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja herbergja íbúð.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í þriggja herbergja íbúð.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í fyrsta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu tvisvar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti fjögurra herbergja íbúð á verði tveggja herbergja.""",
        ),
    ]
    SENT_KLUDGY_ORDINALS_TRANSLATE = [
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 3ja sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 3ja sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu í 1sta sinn.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu í 1sta sinn.""",
        ),
        (
            """Hann sagði: ´Þú ert fífl´! Farðu 2svar í bað.""",
            """Hann sagði: ‚Þú ert fífl‘! Farðu 2svar í bað.""",
        ),
        (
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
            """Ég keypti 4ra herbergja íbúð á verði 2ja herbergja.""",
        ),
    ]
    SENT_CONVERT_TELNOS = [
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 7771234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 777-1234.""",
        ),
        (
            """Hann sagði: Þú ert ´fífl´! Hringdu í 777 1234.""",
            """Hann sagði: Þú ert ‚fífl‘! Hringdu í 777-1234.""",
        ),
    ]
    SENT_CONVERT_NUMBERS = [
        (
            """Hann "gaf" mér 10,780.65 dollara.""",
            """Hann „gaf“ mér 10.780,65 dollara.""",
        ),
        (
            """Hann "gaf" mér €10,780.65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
        (
            """Hann "gaf" mér €10.780,65.""",
            """Hann „gaf“ mér €10.780,65.""",
        ),
    ]
    for sent, correct in SENT:
        s = t.tokenize(sent)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_MODIFY:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_MODIFY)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_KLUDGY_ORDINALS_TRANSLATE:
        s = t.tokenize(sent, handle_kludgy_ordinals=t.KLUDGY_ORDINALS_TRANSLATE)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_CONVERT_TELNOS:
        s = t.tokenize(sent, convert_telnos=True)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct
    for sent, correct in SENT_CONVERT_NUMBERS:
        s = t.tokenize(sent, convert_numbers=True)
        txt = t.correct_spaces(" ".join(token.txt for token in s if token.txt))
        assert txt == correct