def refine_corpus(corpus, rule_path, output=None, thread=None):
    """
    Clean up the given corpus according to the rules defined in the files.
    This method utilizes multithreading to accelerate the process.

    Arguments:
        corpus(str): Path to the corpus file.
        rule_path(str): Path to where "parentheses.tsv" and
            "refine_list.tsv" are.
        thread(int): Number of threads to process with.
        output(str): Path to the output file; defaults to
            "<corpus stem>_cleaned.txt".
    """
    import os

    if output is None:
        # splitext handles extensions of any length; the previous
        # hard-coded [:-4] slice silently assumed a 3-character suffix.
        output = os.path.splitext(corpus)[0] + "_cleaned.txt"

    # Build rule-file paths portably; os.path.join tolerates rule_path
    # both with and without a trailing separator.
    file_p = os.path.join(rule_path, "parentheses.tsv")
    file_r = os.path.join(rule_path, "refine_list.tsv")
    parentheses = load_rules(file_p)
    refine_list = load_rules(file_r)

    # Acquire the corpus
    # NOTE(review): the original comment said "skip first line", but no
    # skip argument is passed here (unlike preliminary_cleanup) — confirm
    # readlines' default behavior.
    raw_data = readlines(corpus)

    # Fan the cleanup out across worker threads.
    param = (parentheses, refine_list)
    result = generic_threading(thread, raw_data, corpus_cleanup, param)

    # Write all results to file.
    write_to_file(output, result)
def extract_vocabularies(corpus, rule, output=None, thread=None):
    """
    Extract vocabularies from the corpus; additional rules to achieve
    purer vocabularies can be defined in src/refine_rules/voc_cleanup.tsv

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of threads to process with (currently unused;
            threading is still to be implemented below).
        output(str): Path to the output file; defaults to
            "<corpus stem>_vocabulary_list.json".
    """
    import os

    if output is None:
        # splitext is robust to any extension length, unlike [:-4].
        output = os.path.splitext(corpus)[0] + "_vocabulary_list.json"

    # Load rules
    rules = load_rules(rule)

    # Acquire the corpus
    raw_data = readlines(corpus, limit=None)

    # Threading (TO-BE-IMPLEMENTED)
    # param = (rules, "SPLIT_WORDS")
    # generic_threading(thread, raw_data, punctuation_cleanup, param)
    result = punctuation_cleanup(0, raw_data, rules, mode='SPLIT_WORDS')

    # Count word occurrences (fixes the "occurance" typo in the message).
    print("Counting occurrences...")
    voc_list = Counter(result)

    # Save vocabulary to file
    write_to_file(output, voc_list)
def preliminary_cleanup(corpus, rule, output=None, thread=None, limit=None):
    """
    Preliminary cleanup of the corpus to make it easier for further
    processing methods. This method can be used to correct the missing
    spaces after punctuation; any other customized rules can be added to
    the rule file. See punctuation_cleanup in utils for the formatting
    of the rules.

    Arguments:
        corpus(str): Path to the corpus file.
        rule(str): Path to the processing rule file.
        thread(int): Number of threads to process with.
        output(str): Path to the output file; defaults to
            "<corpus stem>_preprocessed.tsv".
        limit(int): Maximum number of lines to read, forwarded to
            readlines; None reads everything.
    """
    import os

    # Output name — splitext handles any extension length, unlike the
    # previous hard-coded [:-4] slice.
    if output is None:
        output = os.path.splitext(corpus)[0] + "_preprocessed.tsv"

    # Load rules
    rules = load_rules(rule)

    # Load data (skip the header line)
    raw_data = readlines(corpus, limit=limit, skip=True)

    # Fan the cleanup out across worker threads.
    param = (rules, "PRELIMINARY")
    result = generic_threading(thread, raw_data, punctuation_cleanup, param)

    # Write result to file
    write_to_file(output, result)
def test_exclude_word(engine):
    """A rule with an excluding-word operator tags only the non-excluded text.

    NOTE: the Rust engine doesn't work here, because Re2 doesn't support
    a backtracking operator.
    """
    parser = engine(load_rules("examples/excluding-word.rita"))

    matched = parser("weather is awesome")
    unmatched = parser("weather is cold")

    assert matched[0] == ("weather is awesome", "GOOD_WEATHER")
    assert len(unmatched) == 0
def test_dash_case(engine):
    """Dash-joined tokens (e.g. "knee-length") are handled by the dress rules."""
    parser = engine(load_rules("examples/dress-match.rita"))
    text = """
    Fitted, knee-length dress in soft velour
    """
    entities = set(parser(text))
    print(entities)
    required = {
        ("Fitted, knee-length dress", "DRESS_TYPE"),
        ("soft velour", "DRESS_FABRIC"),
    }
    assert entities.issuperset(required)
def test_color_car(engine):
    """The color-car rules plus normal NER extract all four entity kinds."""
    text = ("John Silver was driving a red car. It was BMW X6 Mclass. "
            "John likes driving it very much.")
    parser = engine(load_rules("examples/color-car.rita"))

    found = set(parser(text))
    print(found)

    must_contain = {
        ("John Silver", "PERSON"),               # Normal NER
        ("red car", "CAR_COLOR"),                # Our first rule
        ("BMW X6 Mclass", "CAR_MODEL"),          # Our second rule
        ("John likes driving", "LIKED_ACTION"),  # Our third rule
    }
    assert found.issuperset(must_contain)
def test_benchmark(benchmark, engine, bench_text):
    """
    These tests will only run if parameters:
    `--benchmark-enable` or `--benchmark-only` are added
    """
    parser = engine(load_rules("examples/cheap-phones.rita"))

    def run_parser(p, rows):
        # Parse every row; results are discarded — only timing matters.
        for row in rows:
            p(row)

    benchmark.pedantic(run_parser, args=(parser, bench_text),
                       iterations=3, rounds=3)
def test_escape_string(engine):
    """Smoke test: the escaped-string example compiling is good enough."""
    rules = load_rules("examples/match-with-escaped-string.rita")
    engine(rules)
def get_rules(self):
    """Parse this object's rules file and return the resulting rule set."""
    rules = load_rules(self.rulesname)
    return rules
def __init__(self, config, iface_int, iface_ext):
    """Save the interface handles and build the rule set.

    Arguments:
        config: mapping that provides the rules-file path under 'rule'.
        iface_int: internal interface handle.
        iface_ext: external interface handle.
    """
    # The geo database is loaded first because rule loading needs it.
    self.geodb = load_geodb()
    self.rules = load_rules(config['rule'], self.geodb)
    self.iface_int = iface_int
    self.iface_ext = iface_ext