Example #1
0
def refactor_labels_gilyon_sub_citation():
    """Merge the 'ספר' and 'תת-ספר' span labels into a single 'כותרת' label.

    Walks every document in the gilyon_sub_citation_output collection,
    rewrites matching span labels in place, and saves the document back.
    """
    labels_to_merge = {'ספר', 'תת-ספר'}
    dbm = MongoProdigyDBManager("gilyon_sub_citation_output")
    for example in dbm.output_collection.find({}):
        for span in example['spans']:
            if span['label'] in labels_to_merge:
                span['label'] = 'כותרת'
        dbm.output_collection.replace_one({"_id": example['_id']}, example)
Example #2
0
def refactor_tos_dh_to_be_named():
    """Split each two-token 'דה' span that starts with a Tosafot DH title.

    A span labeled 'דה' whose first token is one of the Tosafot title
    abbreviations becomes two one-token spans: the title token labeled
    'כותרת' and the remaining token labeled 'דה'. Results are upserted
    into gilyon_sub_citation_output2.
    """
    tos_dh_titles = {'תוד"ה', 'תד"ה'}
    dbm = MongoProdigyDBManager("gilyon_sub_citation_output")
    for doc in dbm.output_collection.find({}):
        updated_spans = []
        for span in doc['spans']:
            span_tokens = [
                doc['tokens'][i]
                for i in range(span['token_start'], span['token_end'] + 1)
            ]
            is_tos_dh = (span['label'] == 'דה'
                         and span_tokens[0]['text'] in tos_dh_titles)
            if not is_tos_dh:
                updated_spans.append(span)
                continue
            # these spans are expected to be exactly [title token, DH token]
            assert len(span_tokens) == 2
            for tok in span_tokens:
                updated_spans.append({
                    "start": tok["start"],
                    "end": tok["end"],
                    "token_start": tok["id"],
                    "token_end": tok["id"],
                    "label": "כותרת" if tok["text"] in tos_dh_titles else "דה",
                })
        doc['spans'] = updated_spans
        dbm.db.gilyon_sub_citation_output2.replace_one(
            {"_id": doc['_id']}, doc, upsert=True)
Example #3
0
def make_csv_by_filter(input_collection, output_file, filter_func):
    """Export spans accepted by ``filter_func`` from ``input_collection`` to CSV.

    For every span that passes ``filter_func(span_text, span)`` two rows are
    written: one carrying the token texts ("Text" row) and one carrying
    ``start|end|id`` triples for the same tokens (positions row) — the pair
    format consumed by ``modify_data_based_on_csv``.

    :param input_collection: name of the source Mongo collection.
    :param output_file: path of the CSV file to write.
    :param filter_func: callable ``(span_text, span) -> bool``.
    """
    rows = []
    my_db = MongoProdigyDBManager('blah')
    max_token_len = 0
    for doc in getattr(my_db.db, input_collection).find({}):
        for span in doc['spans']:
            span_text = doc['text'][span['start']:span['end']]
            if not filter_func(span_text, span):
                continue
            # hoisted out of the is_text loop: token_len is the same both times
            token_len = span['token_end'] - span['token_start'] + 1
            max_token_len = max(max_token_len, token_len)
            for is_text in (True, False):
                temp_row = {
                    "Text": span_text if is_text else '',
                    "Ref": doc['meta']['Ref'],
                }
                for i, itoken in enumerate(
                        range(span['token_start'], span['token_end'] + 1)):
                    token = doc['tokens'][itoken]
                    temp_row[f'Token {i}'] = (
                        token['text'] if is_text else
                        f'{token["start"]}|{token["end"]}|{token["id"]}')
                rows.append(temp_row)
    # newline='' is required by the csv module so the writer controls
    # line endings (avoids blank rows on Windows / mangled quoted newlines).
    with open(output_file, 'w', newline='') as fout:
        writer = csv.DictWriter(
            fout,
            ['Ref', 'Text'] + [f'Token {i}' for i in range(max_token_len)])
        writer.writeheader()
        writer.writerows(rows)
Example #4
0
def fix_perek():
    """Apply manual span fixes from fix_perek.txt to Copy_of_examples2_output.

    The file is a sequence of groups: a ref line, then one or more snippet
    lines (optionally suffixed ``x<N>`` for an expected match count), then a
    blank line ending the group. Each snippet is tokenized and matched
    against the document's tokens; non-overlapping matches are added as new
    'מקור' spans.
    """
    db_mng = MongoProdigyDBManager("Copy_of_examples2_output")
    curr_ref = None
    nlp, model_exists = load_model(
        './research/prodigy/output/ref_tagging_cpu/model-last', ['מקור'])
    tokenizer = custom_tokenizer_factory()(nlp)
    with open("/home/nss/Downloads/fix_perek.txt", "r") as fin:
        for line in fin:
            line = line.strip()
            if curr_ref is None:
                curr_ref = line
            elif len(line) == 0:
                # blank line terminates the current group
                curr_ref = None
            else:
                doc = db_mng.output_collection.find_one({"meta.Ref": curr_ref})
                if doc is None:
                    print("oh no", curr_ref)
                    # BUGFIX: previously fell through and crashed on
                    # doc['spans'] below when the ref was missing.
                    continue
                # optional "snippet xN" suffix: N is the allowed match count
                mult_split = line.split('x')
                if len(mult_split) == 1:
                    mult = 1
                else:
                    line, mult = mult_split
                    mult = int(mult)
                words = [t.text for t in tokenizer(line)]
                # token indices already covered by existing spans
                already_matched_inds = reduce(lambda a, b: a | b, [
                    set(range(span['token_start'], span['token_end']))
                    for span in doc['spans']
                ], set())
                match_inds = get_all_sequence_match_inds(
                    [t['text'] for t in doc['tokens']], words)
                # keep only matches that don't overlap an existing span
                match_inds = list(
                    filter(
                        lambda x: len(
                            set(range(x[0], x[1])) & already_matched_inds) ==
                        0, match_inds))
                if len(match_inds) > mult:
                    # more matches than expected — ambiguous, skip and report
                    print(line, curr_ref, words, mult)
                    continue
                # All checks have passed! Do edit
                for token_start, token_end in match_inds:
                    start_token = next(x for x in doc['tokens']
                                       if x['id'] == token_start)
                    end_token = next(x for x in doc['tokens']
                                     if (x['id'] == token_end - 1))
                    doc['spans'] += [{
                        "label": "מקור",
                        "start": start_token['start'],
                        "end": end_token['end'],
                        "token_start": token_start,
                        "token_end": token_end - 1  # classic off-by-one...
                    }]
                db_mng.output_collection.update_one(
                    {"_id": doc['_id']}, {"$set": {
                        "spans": doc['spans']
                    }})
Example #5
0
def merge_gold_full_into_silver_binary():
    """Explode gold full-annotation examples into binary (one-span) examples.

    Each span of a gold example becomes its own document in
    silver_output_binary. Additionally, random spans that do NOT overlap
    any gold span are inserted with ``answer='reject'`` as negative
    examples (~1 per 100 tokens).

    NOTE(review): correctness of the output stream depends on the exact
    order of ``random.choice`` calls and on ``gold.copy()`` being a
    shallow copy (the copies share the 'tokens'/'spans' values with
    ``gold``), so the statement order here is deliberate.
    """
    import random
    gold_db = MongoProdigyDBManager("gold_output_full")
    silver_db = MongoProdigyDBManager("silver_output_binary")
    for gold in gold_db.output_collection.find({}):
        gold['_view_id'] = 'ner'
        # (token_start, token_end) pairs already used, gold or synthetic
        spans = set()
        for ispan, span in enumerate(gold['spans']):
            binary_gold = gold.copy()
            binary_gold['spans'] = [span]
            if ispan > 0:
                # only the first binary copy keeps the token list
                del binary_gold['tokens']
            del binary_gold['_id']
            silver_db.output_collection.insert_one(binary_gold)
            spans.add((span['token_start'], span['token_end']))
        new_spans = []
        # target roughly one rejected (negative) span per 100 tokens
        while len(new_spans) < len(gold['tokens']) / 100:
            if len(gold['tokens']) < 20: break
            rand_token_start = random.choice(range(len(gold['tokens']) - 6))
            # random span of 3-5 tokens starting at rand_token_start
            new_span = (rand_token_start,
                        rand_token_start + random.choice(range(3, 6)))
            if new_span in spans or new_span[1] >= len(gold['tokens']):
                continue
            spans.add(new_span)
            new_spans += [new_span]
            binary_gold = gold.copy()
            start_char = next(x for x in gold['tokens']
                              if x['id'] == new_span[0])
            end_char = next(x for x in gold['tokens']
                            if (x['id'] == new_span[1]))
            binary_gold['spans'] = [{
                "start": start_char['start'],
                "end": end_char['end'],
                "token_start": new_span[0],
                "token_end": new_span[1],
                "label": "מקור"
            }]
            del binary_gold['tokens']
            del binary_gold['_id']
            binary_gold['answer'] = 'reject'
            silver_db.output_collection.insert_one(binary_gold)
Example #6
0
def combine_all_sentences_to_paragraphs():
    """Group sentence examples by their meta.Ref and merge each group into
    one paragraph example, replacing the contents of examples1_input.
    """
    # NOTE(review): sibling calls pass a collection name first, e.g.
    # MongoProdigyDBManager('blah', 'localhost', 27017) — here 'localhost'
    # lands in that slot. Verify the intended constructor arguments.
    my_db = MongoProdigyDBManager('localhost', 27017)
    examples = my_db.db.examples
    examples_by_ref = defaultdict(list)
    for example in examples.find({}):
        examples_by_ref[example['meta']['Ref']] += [example]
    combined_examples = [
        combine_sentences_to_paragraph(sentences)
        for sentences in examples_by_ref.values()
    ]
    my_db.db.examples1_input.delete_many({})
    # insert_many raises InvalidOperation on an empty list — guard it
    if combined_examples:
        my_db.db.examples1_input.insert_many(combined_examples)
Example #7
0
def make_prodigy_input_sub_citation(citation_collection, output_collection):
    """Build Prodigy input docs from the spans of an annotated collection.

    Each span of every document in ``citation_collection`` becomes its own
    empty-span document in ``output_collection``, whose meta records the
    original Ref and the span's character offsets.

    :param citation_collection: name of the source collection.
    :param output_collection: name of the destination collection (cleared first).
    """
    my_db = MongoProdigyDBManager('blah', 'localhost', 27017)
    # hoist the loop-invariant getattr: resolved once instead of per span
    out_coll = getattr(my_db.db, output_collection)
    out_coll.delete_many({})
    for doc in getattr(my_db.db, citation_collection).find({}):
        for span in doc['spans']:
            span_text = doc['text'][span['start']:span['end']]
            out_coll.insert_one({
                "text": span_text,
                "spans": [],
                "meta": {
                    "Ref": doc['meta']['Ref'],
                    "Start": span['start'],
                    "End": span['end']
                }
            })
Example #8
0
def modify_data_based_on_csv(in_file, input_collection, output_collection):
    """Apply span-boundary edits from a CSV (written by make_csv_by_filter)
    to ``input_collection`` and write the result to ``output_collection``.

    The CSV holds row pairs: a text row followed by a positions row whose
    ``Token i`` cells are ``start|end|token_id``. Each pair is keyed by
    ``Ref|Text|start|token_start``; matching spans in the source docs get
    their boundaries replaced. Unused CSV keys are printed at the end.

    :param in_file: path of the CSV file to read.
    :param input_collection: name of the source collection.
    :param output_collection: name of the destination collection (cleared first).
    """
    my_db = MongoProdigyDBManager(output_collection)
    docs = getattr(my_db.db, input_collection).find({})
    span_map = defaultdict(list)
    input_keys = set()
    # newline='' per the csv module docs (correct handling of quoted newlines)
    with open(in_file, 'r', newline='') as fin:
        rows = csv.DictReader(fin)
        for text_row in rows:
            # rows come in (text, positions) pairs; tolerate a trailing odd row
            num_row = next(rows, None)
            if num_row is None:
                break
            start, end, token_start = num_row["Token 0"].split('|')
            token_end = token_start
            key = f'{text_row["Ref"]}|{text_row["Text"]}|{start}|{token_start}'
            # walk the Token columns; end/token_end track the last filled one
            icol = 0
            while True:
                col_data = num_row.get(f'Token {icol}', None)
                if col_data is None or len(col_data) == 0:
                    break
                _, end, token_end = col_data.split('|')
                icol += 1
            span_map[key] += [(start, end, token_start, token_end)]
            input_keys.add(key)
    used_keys = set()
    getattr(my_db.db, output_collection).delete_many({})
    for doc in docs:
        for span in doc['spans']:
            full_text = doc['text'][span['start']:span['end']]
            key = f'{doc["meta"]["Ref"]}|{full_text}|{span["start"]}|{span["token_start"]}'
            # .get avoids polluting the defaultdict with empty entries
            span_list = span_map.get(key, [])
            # only apply when the key is unambiguous (exactly one edit)
            if len(span_list) == 1:
                start, end, token_start, token_end = span_list[0]
                span['start'] = int(start)
                span['end'] = int(end)
                span['token_start'] = int(token_start)
                span['token_end'] = int(token_end)
                used_keys.add(key)
        getattr(my_db.db, output_collection).insert_one(doc)
    for key in input_keys:
        if key not in used_keys:
            print("unused", key)
Example #9
0
def move_binary_output_to_own_collection():
    """Copy all binary ('ner' view) annotations from examples2_output into
    the dedicated examples2_binary collection.
    """
    my_db = MongoProdigyDBManager('blah')
    binary_output = list(my_db.db.examples2_output.find({"_view_id": "ner"}))
    # insert_many raises InvalidOperation on an empty list — guard it
    if binary_output:
        my_db.db.examples2_binary.insert_many(binary_output)
Example #10
0
def merge_into_silver_full(collection):
    """Append every example from ``collection`` into silver_output_full.

    :param collection: name of the source collection to copy from.
    """
    source_db = MongoProdigyDBManager(collection)
    target_db = MongoProdigyDBManager("silver_output_full")
    for example in source_db.output_collection.find({}):
        target_db.output_collection.insert_one(example)
Example #11
0
def get_prev_tagged_refs(collection):
    """Return the set of meta.Ref values already present in ``collection``.

    :param collection: name of the collection to inspect.
    :return: set of distinct 'meta.Ref' strings.
    """
    my_db = MongoProdigyDBManager(collection, 'localhost', 27017)
    # Collection.distinct with no filter is equivalent to
    # find({}).distinct(...) and skips the intermediate cursor.
    return set(my_db.output_collection.distinct('meta.Ref'))