import json
import os

# Dataset, Document, Example, Summary, SummaryGroup, SummariesPair, and User
# are the project's ORM models; their import path is not shown in the
# original source.


def init_database(db, input_path, exist_annotation, db_id, batch_size):
    """Load WebNLG examples, splitting them into datasets of `batch_size` examples."""
    with open(input_path, 'r') as infile:
        json_obj = json.load(infile)

    dataset = Dataset(name="WebNLG_" + str(db_id))
    db.session.add(dataset)
    db.session.commit()

    example_id = 300
    for obj in json_obj:
        for i in range(3):
            # Skip targets that have already been annotated.
            if obj['ID'] in exist_annotation \
                    and i in exist_annotation[obj['ID']]:
                continue
            # Start a new dataset once the current batch is full.
            if example_id > 300 and example_id % batch_size == 0:
                db_id += 1
                dataset = Dataset(name="WebNLG_" + str(db_id))
                db.session.add(dataset)
                db.session.commit()
            example = Example(id=example_id,
                              dataset_id=dataset.id,
                              ex_id=obj['ID'],
                              tgt_id=i,
                              src_json=json.dumps(obj['SRC']),
                              tgt_json=json.dumps(obj['TGT-' + str(i)]),
                              sanity_check=json.dumps(obj['CHK-' + str(i)]))
            db.session.add(example)
            db.session.commit()
            example_id += 1
    return example_id
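# Hedged usage sketch for the batched loader above, not part of the original
# source: the `app` module, the JSON file name, and the batch size are all
# assumptions made for illustration.
if __name__ == '__main__':
    from app import db  # hypothetical Flask-SQLAlchemy handle

    next_id = init_database(db, 'webnlg.json',
                            exist_annotation={},  # nothing annotated yet
                            db_id=0, batch_size=100)
    print('next free example id:', next_id)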
import pytest

# Dataset, DatasetVisibility, and ValidationError come from the project's
# schema module; the import path is not shown in the original source.


def test_name_too_short():
    """Ensure that dataset names shorter than 2 characters raise a validation error."""
    min_length = 2
    # A name at the minimum length is accepted.
    Dataset(name="a" * min_length)
    # Anything shorter is rejected.
    for i in range(min_length):
        with pytest.raises(ValidationError):
            Dataset(name="a" * i)
def one_split(db, idx, sanity_data):
    # Insert dataset
    dataset = Dataset(name="ALG_FACT" + str(idx))
    db.session.add(dataset)
    db.session.commit()

    # `dataset_path` and `example_filter` are module-level names in the
    # original source.
    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')

    for doc_id in sanity_data:
        file_name = doc_id + ".data"
        file_path = os.path.join(documents_path, file_name)
        summ_path = os.path.join(summaries_path, file_name)
        with open(summ_path, 'r') as infile:
            summ_json = json.load(infile)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
        did = json_result['doc_id']
        for item in summ_json:
            # Keep only summaries whose name carries the "|||" marker and
            # that pass the example filter.
            if "|||" not in item['name']:
                continue
            if example_filter(item['text']):
                continue
            document = Document(
                dataset_id=dataset.id,
                doc_id=json_result['doc_id'],
                doc_json=json.dumps(json_result),
                summary=json.dumps(item),
                sanity_statement=sanity_data[did]["sanity_statement"],
                sanity_answer=sanity_data[did]["sanity_answer"]
            )
            db.session.add(document)
            db.session.commit()
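# Shape of the `sanity_data` mapping consumed by one_split above, reconstructed
# from how the BBC_pair loader below builds it; the doc id and statement are
# invented examples.
SANITY_DATA_EXAMPLE = {
    "doc_001": {
        "sanity_statement": "The article mentions a named entity.",
        "sanity_answer": True,
    },
}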
def init_database(db):
    # user = User(email='admin@localhost', password='******')
    # db.session.add(user)
    # db.session.commit()
    dataset_path = '../backend/BBC'
    dataset_name = os.path.split(dataset_path)[1]
    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')

    # Existing dataset
    # dataset = db.session.query(Dataset).filter_by(name='BBC').first()

    # Insert dataset
    dataset = Dataset(name="BBC_test")
    db.session.add(dataset)
    db.session.commit()

    # Insert documents
    for file in os.listdir(documents_path):
        file_path = os.path.join(documents_path, file)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
        document = Document(dataset_id=dataset.id,
                            doc_id=json_result['doc_id'],
                            doc_json=json.dumps(json_result),
                            summary="aaaaaaa")  # placeholder summary text
        db.session.add(document)
        db.session.commit()

    # Insert summaries
    for folder in os.listdir(summaries_path):
        if folder.startswith('ref'):
            summary_group = SummaryGroup(name='%s_ref_%s' % (dataset_name, folder[4:]),
                                         dataset_id=dataset.id, is_ref=True)
        elif folder.startswith('system'):
            summary_group = SummaryGroup(name='%s_system_%s' % (dataset_name, folder[7:]),
                                         dataset_id=dataset.id, is_ref=False)
        else:
            # Skip folders that are neither references nor system outputs.
            # (The original `break` stopped at the first such folder, in
            # whatever order os.listdir happened to return.)
            continue
        db.session.add(summary_group)
        db.session.commit()
        ref_path = os.path.join(summaries_path, folder)
        for file in os.listdir(ref_path):
            with open(os.path.join(ref_path, file), 'r') as infile:
                text = ' '.join(infile.readlines()).strip()
            document = db.session.query(Document).filter_by(
                doc_id=os.path.splitext(file)[0]).first()
            summary = Summary(doc_id=document.id, text=text,
                              summary_group_id=summary_group.id)
            db.session.add(summary)
            db.session.commit()
def init_database(db):
    # user = User(email='admin@localhost', password='******')
    # db.session.add(user)
    # db.session.commit()
    dataset_path = '../backend/BBC_pair'
    dataset_name = os.path.split(dataset_path)[1]
    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')
    sanity_path = os.path.join(dataset_path, 'sanity_id/sanity.txt')

    # Existing dataset
    # dataset = db.session.query(Dataset).filter_by(name='BBC').first()

    # Insert dataset
    dataset = Dataset(name="BBC")
    db.session.add(dataset)
    db.session.commit()

    # One record per document: id, sanity-check statement, expected answer.
    sanity_data = {}
    with open(sanity_path) as sanity_file:
        for line in sanity_file:
            flist = line.strip().split("\t")
            sanity_data[flist[0]] = {
                "sanity_answer": bool(int(flist[2])),
                "sanity_statement": flist[1]
            }

    # Insert documents, one row per (document, summary) pair
    for file in os.listdir(documents_path):
        file_path = os.path.join(documents_path, file)
        summ_path = os.path.join(summaries_path, file)
        with open(summ_path, 'r') as infile:
            summ_json = json.load(infile)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
        did = json_result['doc_id']
        for item in summ_json:
            document = Document(
                dataset_id=dataset.id,
                doc_id=json_result['doc_id'],
                doc_json=json.dumps(json_result),
                summary=json.dumps(item),
                sanity_statement=sanity_data[did]["sanity_statement"],
                sanity_answer=sanity_data[did]["sanity_answer"])
            db.session.add(document)
            db.session.commit()
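# Expected layout of sanity_id/sanity.txt, reconstructed from the parsing loop
# above; the concrete values are invented. One tab-separated record per line:
#
#     doc_001<TAB>The article mentions a named entity.<TAB>1
#
# The third column must be "0" or "1"; it is coerced to bool via bool(int(...)).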
def init_database(db, input_path, db_id, example_id):
    # Insert dataset
    dataset = Dataset(name="WebNLG_" + str(db_id))
    db.session.add(dataset)
    db.session.commit()

    with open(input_path, 'r') as infile:
        json_obj = json.load(infile)

    for obj in json_obj:
        for i in range(3):
            example_id += 1
            example = Example(id=example_id,
                              dataset_id=dataset.id,
                              ex_id=obj['ID'],
                              tgt_id=i,
                              src_json=json.dumps(obj['SRC']),
                              tgt_json=json.dumps(obj['TGT-' + str(i)]),
                              sanity_check=json.dumps(obj['CHK-' + str(i)]))
            db.session.add(example)
            db.session.commit()
    return example_id
def init_database(db):
    user = User(email='admin@localhost', password='******')
    db.session.add(user)
    db.session.commit()

    dataset_path = '/home/acp16hh/Projects/Research/Experiments/Exp_Elly_Human_Evaluation/src/Mock_Dataset_2/BBC_Sample'
    # dataset_path = '/home/acp16hh/Projects/Research/Experiments/Exp_Elly_Human_Evaluation/src/Mock_Dataset_2/BBC'
    dataset_name = os.path.split(dataset_path)[1]
    summaries_path = os.path.join(dataset_path, 'summaries')
    documents_path = os.path.join(dataset_path, 'documents')

    # Existing dataset
    # dataset = db.session.query(Dataset).filter_by(name='BBC').first()

    # Insert dataset
    dataset = Dataset(name=dataset_name)
    db.session.add(dataset)
    db.session.commit()

    # Insert documents
    for file in os.listdir(documents_path):
        file_path = os.path.join(documents_path, file)
        with open(file_path, 'r') as infile:
            json_result = json.load(infile)
        document = Document(
            dataset_id=dataset.id,
            doc_id=json_result['doc_id'],
            doc_json=json.dumps(json_result)
        )
        db.session.add(document)
        db.session.commit()

    # Insert summaries
    for folder in os.listdir(summaries_path):
        if folder.startswith('ref'):
            summary_group = SummaryGroup(name='%s_ref_%s' % (dataset_name, folder[4:]),
                                         dataset_id=dataset.id, is_ref=True)
        elif folder.startswith('system'):
            summary_group = SummaryGroup(name='%s_system_%s' % (dataset_name, folder[7:]),
                                         dataset_id=dataset.id, is_ref=False)
        else:
            # Skip folders that are neither references nor system outputs
            # (the original `break` would have stopped at the first one).
            continue
        db.session.add(summary_group)
        db.session.commit()
        ref_path = os.path.join(summaries_path, folder)
        for file in os.listdir(ref_path):
            with open(os.path.join(ref_path, file), 'r') as infile:
                text = ' '.join(infile.readlines()).strip()
            document = db.session.query(Document).filter_by(
                doc_id=os.path.splitext(file)[0]).first()
            summary = Summary(
                doc_id=document.id,
                text=text,
                summary_group_id=summary_group.id
            )
            db.session.add(summary)
            db.session.commit()

    # Insert pairs: match every system summary with the reference summary
    # written for the same document.
    ref_summary_groups = db.session.query(SummaryGroup).filter_by(
        dataset_id=dataset.id, is_ref=True).all()
    system_summary_groups = db.session.query(SummaryGroup).filter_by(
        dataset_id=dataset.id, is_ref=False).all()
    for ref_summ_group in ref_summary_groups:
        for system_summ_group in system_summary_groups:
            for system_summary in system_summ_group.summaries:
                ref_summary = db.session.query(Summary)\
                    .filter_by(summary_group_id=ref_summ_group.id,
                               doc_id=system_summary.doc_id).first()
                summaries_pair = SummariesPair(
                    ref_summary_id=ref_summary.id,
                    system_summary_id=system_summary.id,
                    dataset_id=dataset.id
                )
                db.session.add(summaries_pair)
                db.session.commit()
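# Hedged driver sketch for the loader above, not part of the original source;
# the application-factory import is an assumption. Flask-SQLAlchemy sessions
# need an active application context, hence the `with` block.
if __name__ == '__main__':
    from app import create_app, db  # hypothetical application module

    with create_app().app_context():
        init_database(db)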
def test_empty_description():
    """Ensure that an empty description is allowed."""
    Dataset(name="demo", description="")
def test_description_too_long():
    """Ensure that dataset descriptions with length > 256 raise a validation error."""
    max_length = 256
    # A description at the maximum length is accepted.
    Dataset(name="demo", description="a" * max_length)
    with pytest.raises(ValidationError):
        Dataset(name="demo", description="a" * (max_length + 1))
def test_name_too_long():
    """Ensure that dataset names longer than 64 characters raise a validation error."""
    max_length = 64
    Dataset(name="a" * max_length)
    with pytest.raises(ValidationError):
        Dataset(name="a" * (max_length + 1))
def test_name_missing_error():
    with pytest.raises(ValidationError):
        Dataset(visibility="something_else")
def test_visibility_validation_error():
    with pytest.raises(ValidationError):
        Dataset(name="demo", visibility="something_else")
def test_public_visibility():
    dataset = Dataset(name="demo", visibility="public")
    assert dataset.visibility == DatasetVisibility.PUBLIC
    assert dataset.visibility == "public"
def test_private_visibility():
    dataset = Dataset(name="demo", visibility="private")
    assert dataset.visibility == DatasetVisibility.PRIVATE
    assert dataset.visibility == "private"
def test_default():
    dataset = Dataset(name="demo")
    assert dataset.visibility == DatasetVisibility.PUBLIC
    assert dataset.visibility == "public"
    assert dataset.description is None
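# The tests above pin down the validation contract: `name` is required and
# 2-64 characters long, `description` is optional and at most 256 characters,
# and `visibility` is a public/private string enum defaulting to public. Below
# is a standalone pydantic sketch that satisfies that contract; it is purely
# illustrative and not the project's actual Dataset model, which is an ORM
# class.
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field, ValidationError


class DatasetVisibility(str, Enum):
    PUBLIC = "public"
    PRIVATE = "private"


class Dataset(BaseModel):
    name: str = Field(min_length=2, max_length=64)
    description: Optional[str] = Field(default=None, max_length=256)
    visibility: DatasetVisibility = DatasetVisibility.PUBLIC


if __name__ == "__main__":
    try:
        Dataset(name="a")  # too short: raises ValidationError
    except ValidationError as exc:
        print(exc)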