def scrape(self):
    name = 'FloridaGov'
    url = 'https://floridahealthcovid19.gov/frequently-asked-questions/'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "lxml")
    questions = [
        str(q) for q in soup.findAll("h4", {"class": "panel-title"})
    ]
    answers = [
        str(a) for a in soup.findAll("div", {"class": "panel-body"})
    ]
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Florida",
            "language": "en"
        })
    return converter.write()
def scrape(self):
    name = 'Delaware State Government'
    url = 'https://coronavirus.delaware.gov/what-delawareans-can-do/#faqs'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "lxml")
    questions = [
        str(q) for q in soup.findAll("h4", {"class": "panel-title"})
    ]
    answers = [
        str(a) for a in soup.findAll("div", {"class": "panel-body"})
    ]
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Delaware",
            "language": "en"
        })
    return converter.write()
def scrape(self):
    name = 'North Dakota State Government'
    url = 'https://ndresponse.gov/covid-19-resources/covid-19-faqs'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "lxml").findAll(
        'div', {'class': 'view-content'})[4].findAll(
            'div', {'class': 'views-row'})
    soup = [x.findAll('div', {'class': 'views-row'}) for x in soup]
    soup = list(itertools.chain.from_iterable(soup))
    questions = list(map(self._extract_question, soup))
    answers = list(map(self._extract_answer, soup))
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "North Dakota",
            "language": "en"
        })
    return converter.write()
def scrape(self):
    name = 'Kansas Department of Health and Environment'
    url = 'https://ks-kdhecovid19.civicplus.com/faq.aspx'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "lxml").find(
        'div', {'id': 'modulecontent'}).findAll('dl')
    questions = list(map(self._extract_question, soup))
    answers = list(map(self._extract_answer, soup))
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Kansas",
            "language": "en"
        })
    return converter.write()
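# The Kansas scraper above calls _extract_question/_extract_answer helpers
# that are not shown in this section. A minimal sketch, assuming each <dl>
# on the CivicPlus FAQ page wraps one Q/A pair with the question in <dt>
# and the answer markup in <dd>; the method names match the calls above,
# but the bodies are illustrative, not the repository's implementation.
def _extract_question(self, dl):
    # Keep the raw markup, mirroring how the other scrapers str() their tags.
    dt = dl.find('dt')
    return str(dt) if dt else ''

def _extract_answer(self, dl):
    dd = dl.find('dd')
    return str(dd) if dd else ''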
def test_key_exception(self):
    with self.assertRaises(KeyError) as ke:
        converter = Conversion('test', '.')
        converter.addExample({
            'sourceUrl': 'example.com',
            'language': 'en',
        })
def test_value_exception(self):
    with self.assertRaises(ValueError) as ve:
        converter = Conversion('test', '.')
        converter.addExample({
            'sourceUrl': ['example.com'],
            "language": 'en',
        })
def scrape(self):
    name = 'Vermont Department of Health'
    url = 'https://apps.health.vermont.gov/COVID/faq/'
    html = requests.get(url).text
    lastUpdateTime = time.mktime(
        dateparser.parse(
            BeautifulSoup(html, "lxml").find('p', {'class': 'subtitle'})
            .getText().split('Updated:')[1].strip()).timetuple())
    soup = BeautifulSoup(html, "lxml").find(
        'ul', {'class': 'topics'}).findAll('li', {'class': 'faq'})
    questions = list(map(self._extract_question, soup))
    answers = list(map(self._extract_answer, soup))
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Vermont",
            "language": "en"
        })
    return converter.write()
def scrape(self):
    scraper_list = [
        Arbeitsagentur,
        BAUA,
        BMAS,
        BMG,
        BMWI,
        BVF,
        BZgA,
        BerlinerSenat,
        Bundesregierung,
        CDC_Children,
        CDC_Individuals,
        CDC_Travel,
        CDC_Water,
        ECDC,
        FHM_EN,
        FHM_SV,
        GOV_pl,
        IHK,
        KVB,
        RKI,
        Salute_IT,
        # UNICEF,
        WHO,
    ]
    logger = logging.getLogger(__name__)
    logging.disable(logging.WARNING)
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'ITEM_PIPELINES': {'__main__.Pipeline': 1}
    })
    for crawler in scraper_list:
        process.crawl(crawler)
    process.start()
    df = pd.concat(RESULTS)
    converter = Conversion(self._filename, self._path)
    for _, row in df.iterrows():
        converter.addExample({
            'sourceUrl': row.link,
            'sourceName': row.source,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": row.question,
            "answer": row.answer_html,
            "hasAnswer": bool(row.answer),
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": row.country,
            "language": row.lang,
        })
    return converter.write()
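# The aggregator above registers '__main__.Pipeline' and later concatenates
# a global RESULTS list with pandas. A minimal sketch of what such a Scrapy
# pipeline could look like, assuming each spider yields items carrying the
# fields read in the loop above (link, source, question, answer,
# answer_html, country, lang); this is an illustration, not necessarily the
# repository's actual pipeline.
RESULTS = []  # module-level accumulator; assumed to be the RESULTS used by scrape() above

class Pipeline:
    def open_spider(self, spider):
        self._rows = []

    def process_item(self, item, spider):
        # Collect every scraped item as a plain dict row.
        self._rows.append(dict(item))
        return item

    def close_spider(self, spider):
        # One DataFrame per spider; scrape() concatenates them all.
        RESULTS.append(pd.DataFrame(self._rows))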
def scrape(self):
    Block = namedtuple('Block', 'content tags')
    extra_data = {}
    url = 'https://www.cnn.com/interactive/2020/health/coronavirus-questions-answers/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    lastUpdatedTime = time.mktime(
        dateparser.parse(
            ' '.join(
                soup.find('div', {'class': 'cnnix-timestamp'})
                .getText().split()[1:]),
            date_formats=['%B %d, %Y, %I %p']).timetuple())
    tags = [
        tag.get('data-topic')
        for tag in soup.find_all('div', attrs={'class': 'nav-button'})
    ]
    body = soup.find_all('div', attrs={'class': 'interactive-container'})[1]
    blocks = []
    for div in body.find_all('div'):
        if 'question' == div.get('class')[0]:
            tags = div.get('class')[1:]
            block = Block(div, tags)
            blocks.append(block)
    questions, answers, topics = [], [], []
    for block in blocks:
        question = block.content.find('div', attrs={'class': 'question-q'})
        answer = block.content.find('div', attrs={'class': 'question-a'})
        questions.append(str(question))
        answers.append(str(answer))
        topics.append(block.tags)
    converter = Conversion(self._filename, self._path)
    for q, a, t in zip(questions, answers, topics):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': "CNN",
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": q,
            "answer": a,
            "hasAnswer": a is not None,
            "targetEducationLevel": "NA",
            "topic": t,
            "extraData": {},
            "targetLocation": "United States",
            "language": 'en',
        })
    return converter.write()
def scrape(self):
    name = 'Cleveland Clinic'
    url = 'https://newsroom.clevelandclinic.org/2020/03/18/frequently-asked-questions-about-coronavirus-disease-2019-covid-19/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, "lxml")
    faq = soup.find("div", {"class": "entry-content"})
    answers, questions = [], []
    q = ''
    a = ''
    for e in faq.findAll(recursive=False):
        if e.name == 'h5':
            if q and a:
                questions.append(q.replace('Q:', ''))
                answers.append(a.replace('A:', ''))
            q = str(e)
            a = ''
        else:
            a += " " + str(e)
    if q and a:
        questions.append(q.replace('Q:', ''))
        answers.append(a.replace('A:', ''))
    lastUpdateTime = time.mktime(
        dateparser.parse(
            soup.find("h3", {"entry-sub-title"}).getText().strip()
            .replace("Updated ", "")).timetuple())
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Cleveland",
            "language": "en"
        })
    return converter.write()
def scrape(self):
    name = 'Texas Human Resources'
    url = 'https://www.dshs.state.tx.us/coronavirus/faq.aspx'
    html = requests.get(url, verify=False).text
    soup = BeautifulSoup(html, "lxml")
    # faq is in the second div
    faq = soup.find(
        "div", {"id": "ctl00_ContentPlaceHolder1_uxContent"}).findAll(
            "div", recursive=False)[1]
    lastUpdateTime = time.mktime(
        time.strptime(
            soup.find("span", {"lastUpdatedDate"}).getText().strip(),
            "%B %d, %Y"))
    questions, answers = [], []
    a = ''
    begun = False
    for e in faq.findAll(recursive=False):
        if e.name == 'h3':
            if begun:
                questions.append(q)
                answers.append(a)
            q = str(e)
            a = ''
            begun = True
        elif e.name == 'p' or e.name == 'ul':
            a += str(e)
    if begun:  # guard against a page with no <h3> headings
        questions.append(q)
        answers.append(a)
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Texas",
            "language": 'en'
        })
    return converter.write()
def scrape(self):
    hub_links_to_scrape = [
        'https://hub.jhu.edu/2020/03/30/andrew-pekosz-immunity-seasonality/?fbclid=IwAR2LUcjr7Ltz6koe0IjRV3gr7E3tW0K6hqlcaYPtKQz3HBmjlQ7YRGrtgHw',
        'https://hub.jhu.edu/2020/03/23/how-to-self-quarantine-self-isolate/?mc_cid=0ed1a231a3&mc_eid=9687fd9d33'
    ]
    success = True
    for link in hub_links_to_scrape:
        faqs, lastUpdateTime = self._scrape(link)
        converter = Conversion(self._filename, self._path)
        for faq in faqs:
            converter.addExample(faq)
        success &= converter.write()
    return success
def scrape(self):
    converter = Conversion(self._filename, self._path)
    df = pd.read_csv(open("COVID19infosheet - Info.tsv", 'r'), sep="\t")
    df = self._clean_headers(df)
    df['json'] = df.apply(self._prepare_data, axis=1)
    for obj in df['json']:
        if not obj['hasAnswer']:
            continue
        converter.addExample(obj)
    return converter.write()
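# The spreadsheet scraper above maps each TSV row to a schema dict via
# _prepare_data. A minimal sketch under stated assumptions: after
# _clean_headers the frame exposes 'question' and 'answer' columns, and
# the source URL shown is a hypothetical placeholder; none of these column
# or field choices are confirmed by the repository.
def _prepare_data(self, row):
    raw = row.get('answer', '')
    answer = '' if pd.isna(raw) else str(raw)
    return {
        'sourceUrl': 'https://docs.google.com/spreadsheets/',  # placeholder, not the real sheet URL
        'sourceName': 'COVID19infosheet',
        'needUpdate': True,
        'typeOfInfo': 'QA',
        'isAnnotated': False,
        'responseAuthority': '',
        'question': str(row.get('question', '')),
        'answer': answer,
        'hasAnswer': bool(answer.strip()),
        'targetEducationLevel': 'NA',
        'topic': [],
        'extraData': {},
        'targetLocation': '',
        'language': 'en',
    }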
def scrape(self):
    chrome_driver_path = os.environ['CHROME_DRIVER_PATH']
    name = 'Oregon Public Health Division'
    url = 'https://www.oregon.gov/oha/PH/DISEASESCONDITIONS/DISEASESAZ/Pages/COVID19-FAQ.aspx?wp1284=l:100'
    opts = Options()
    opts.set_headless()
    driver = webdriver.Chrome(executable_path=chrome_driver_path,
                              chrome_options=opts)
    driver.get(url)
    try:
        # The FAQ table is rendered client-side; wait for the first cell.
        WebDriverWait(driver, 60).until(
            ec.presence_of_element_located((By.TAG_NAME, 'td')))
    except Exception:
        driver.quit()  # don't leak the browser on a timeout
        return False
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    questions = soup.findAll('td', {'data-title': 'Question'})
    answers = soup.findAll('td', {'data-title': 'Answer'})
    topics = soup.findAll('td', {'data-title': 'Topic'})
    lastUpdateTime = time.time()
    converter = Conversion(self._filename, self._path)
    for t, q, a in zip(topics, questions, answers):
        topic = self._extract_topic(t)
        question = self._extract_question(q)
        answer = self._extract_answer(a)
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [topic],
            "extraData": {},
            "targetLocation": "Oregon",
            "language": "en"
        })
    driver.quit()
    return converter.write()
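# A minimal sketch of the cell extractors the Oregon scraper calls.
# Assumption: each <td data-title="..."> carries plain text for the topic
# and markup for the question/answer; the names match the calls above,
# but the bodies are illustrative, not the repository's implementation.
def _extract_topic(self, td):
    return td.getText().strip()

def _extract_question(self, td):
    return str(td)

def _extract_answer(self, td):
    return str(td)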
def scrape(self):
    url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/myth-busters'
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    qas_plus_some = soup.find_all(
        'div', class_='sf-content-block content-block')
    qa_pairs = []
    for potential in qas_plus_some:
        for child in potential.children:
            # Super hacky ... but this seemed to be the best way for this site
            if "h2" in str(child):
                s_child = str(child)
                s_child = s_child.replace("\n", " ")
                s_child = s_child.replace(u'\xa0', u' ')
                qa = s_child.split("</h2>")
                if len(qa) == 2:
                    question = str(qa[0])
                    answer = str(qa[1])
                elif len(qa) == 3:  # First question is different
                    question = str(qa[1])
                    answer = str(qa[2])
                else:
                    print("ERROR:")  # TODO: better error handling?
                    continue  # skip malformed blocks instead of re-appending the previous pair
                qa_pairs.append((question, answer))
    converter = Conversion(self._filename, self._path)
    for pair in qa_pairs:
        converter.addExample({
            "sourceName": 'WHOMyth',
            "sourceUrl": url,
            "needUpdate": True,
            "typeOfInfo": 'QA',
            "isAnnotated": False,
            "responseAuthority": "",
            "question": pair[0],
            "answer": pair[1],
            "hasAnswer": True,
            "targetEducationLevel": 'NA',
            "topic": ["Myths"],
            "extraData": {},
            "targetLocation": "",
            "language": 'en'
        })
    return converter.write()
def scrape(self):
    name = 'Hawaii State Government'
    url = 'https://health.hawaii.gov/coronavirusdisease2019/what-you-should-know/faqs/'
    html = requests.get(url).text
    soup = BeautifulSoup(html, "lxml")
    questions = [str(q) for q in soup.findAll("h3")]
    answers = []
    for q in soup.findAll("h3"):
        a = ""
        for tag in q.next_siblings:
            if tag.name == "div":
                break
            else:
                a += str(tag)
        answers.append(a)
    lastUpdate = time.mktime(
        dateparser.parse(
            ' '.join(soup.find('em').getText().split()[1:]),
            date_formats=['%B %d, %Y, %I %p']).timetuple())
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Hawaii",
            "language": "en"
        })
    return converter.write()
def scrape(self):
    name = 'FDA'
    url = 'https://www.fda.gov/emergency-preparedness-and-response/mcm-issues/coronavirus-disease-2019-covid-19-frequently-asked-questions'
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    questions, answers = [], []
    for panelgroup in soup.findAll("div", {"class": "panel-group"}):
        for qa in panelgroup.findAll('div', {"class": "panel"}):
            q = str(qa.find("div", {"class": "panel-heading"})).replace('Q:', '')
            a = str(qa.find("div", {"class": "panel-body"})).replace('A:', '')
            questions.append(q)
            answers.append(a)
    lastUpdateTime = time.mktime(
        time.strptime(
            soup.find("p", {"lcds-description-list__item-text"})
            .getText().strip(),
            "%m/%d/%Y"))
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": 'US',
            'language': 'en'
        })
    return converter.write()
def scrape(self):
    url = 'https://www.avma.org/resources-tools/animal-health-and-welfare/covid-19/covid-19-faqs-pet-owners'
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    faq = soup.find('h3', {'id': '1'})
    questions = []
    answers = []
    a = ''
    begun = False
    for e in faq.next_siblings:
        if e.name == 'h5':
            if begun:
                questions.append(q)
                answers.append(a)
            q = str(e)
            a = ''
            begun = True
        elif e.name == 'p':
            a += str(e)
    if begun:  # guard against a page with no <h5> question headings
        questions.append(q)
        answers.append(a)
    converter = Conversion(self._filename, self._path)
    for q, a in zip(questions, answers):
        converter.addExample({
            'sourceUrl': 'https://www.avma.org/sites/default/files/2020-03/covid-19-faq-pet-owners.pdf',
            'sourceName': 'AVMA',
            # No dates exist on the page
            "needUpdate": True,
            "containsURLs": False,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": q,
            "answer": a,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": ['pets', 'animals'],
            "extraData": {},
            "targetLocation": "US",
            'language': 'en'
        })
    return converter.write()
def test_blank_answer_exception(self):
    with self.assertRaises(ValueError) as e:
        converter = Conversion('test', '.')
        converter.addExample({
            'sourceUrl': 'example.com',
            'sourceName': "example",
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": '<a href="example.com/dir1">What is COVID-19?</a>',
            "answer": '\n \n',
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": ['topic1', 'topic2'],
            "extraData": {'hello': 'goodbye'},
            "targetLocation": "US",
            "language": 'en',
        })
def scrape(self): url = "https://www.hopkinsmedicine.org/health/conditions-and-diseases/coronavirus/coronavirus-frequently-asked-questions" html = requests.get(url).text soup = BeautifulSoup(html, 'lxml').find_all('div', {'class': 'rtf'}) lastUpdateTime = time.mktime( dateparser.parse(soup[-1].getText().strip()[7:]).timetuple()) final_questions = [] final_responces = [] for section in soup: questions = section.find_all('h3') for question in questions: final_questions.append(question.get_text(strip=False)) soup_iter = question answer = "" while soup_iter.find_next_sibling( ) and soup_iter.find_next_sibling().name in ['p', 'ul']: soup_iter = soup_iter.find_next_sibling() answer += " " + str(soup_iter) final_responces.append(answer) converter = Conversion(self._filename, self._path) for q, a in zip(final_questions, final_responces): converter.addExample({ 'sourceUrl': url, 'sourceName': "JHU Medicine", "needUpdate": True, "containsURLs": False, "typeOfInfo": "QA", "isAnnotated": False, "responseAuthority": "", "question": q, "answer": a, "hasAnswer": True, "targetEducationLevel": "NA", "topic": [], "extraData": {}, 'targetLocation': '', 'language': 'en' }) return converter.write()
def test_time_consistency(self):
    subprocess.run(
        ['touch', './schema_v0.3/test_time_consistency_v0.3.jsonl'])
    converter = Conversion('test_time_consistency', '.')
    converter.addExample({
        'sourceUrl': 'time.com',
        'sourceName': "time",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": 'Hello, my time should match my next line?',
        "answer": 'Hello this is the example response',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    converter.addExample({
        'sourceUrl': 'time.com',
        'sourceName': "uuid",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": 'Do I match the above line time? Please say yes!',
        "answer": 'Hello this is the example response',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    converter.write()
    with jsonlines.open(
            './schema_v0.3/test_time_consistency_v0.3.jsonl') as reader:
        line = reader.read()
        dateLastChanged_0 = line['dateLastChanged']
        line = reader.read()
        dateLastChanged_1 = line['dateLastChanged']
    self.assertEqual(dateLastChanged_0, dateLastChanged_1)
    subprocess.run(
        ['rm', './schema_v0.3/test_time_consistency_v0.3.jsonl'])
def scrape(self):
    name = 'NYTimes'
    url = 'https://www.nytimes.com/interactive/2020/world/coronavirus-tips-advice.html'
    html = urlopen(url)
    soup = BeautifulSoup(html, "lxml")
    questions, answers = [], []
    for panelgroup in soup.findAll("div", {"class": "g-question-wrap"}):
        q = str(panelgroup.find('h3'))
        a = str(panelgroup.find('div', {'class': "g-answer-wrap"}))
        questions.append(q)
        answers.append(a)
    lastUpdateTime = time.mktime(
        time.strptime(soup.find('time').getText(), "Updated %B %d, %Y"))
    converter = Conversion(self._filename, self._path)
    for question, answer in zip(questions, answers):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': name,
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": question,
            "answer": answer,
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "",
            "language": 'en'
        })
    return converter.write()
def test_addExample(self):
    converter = Conversion('test', '.')
    converter.addExample({
        'sourceUrl': 'example.com',
        'sourceName': "example",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": '<a href="example.com/dir1">What is COVID-19?</a>',
        "answer": '<p><a href="example.com/dir2">Coronaviruses</a> are a large family of viruses.</p>',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    self.assertEqual(len(converter._examples), 1)
    self.assertEqual(converter.write(), True)
def scrape(self):
    url = 'https://www.globalhealthnow.org/2020-02/coronavirus-expert-reality-check'
    html = requests.get(url).text
    lastUpdateTime = time.mktime(
        time.strptime(
            BeautifulSoup(html, 'lxml').find(
                'div', {'class': 'article-meta-wrap'}).getText().strip(),
            '%B %d, %Y'))
    soup = BeautifulSoup(html, 'lxml').find(
        'div', {'property': 'schema:text'}).findAll('h3')
    questions_list = list(filter(self._filter_h3_headers, soup))
    questions = [x.getText().strip() for x in questions_list]
    responses = list(map(self._get_responces, questions_list[:-1]))
    responses.append(self._get_final_responce(questions_list[-1]))
    responses = list(map(self._truncate_responce, responses))
    topics = list(map(self._get_topic, questions_list))
    converter = Conversion(self._filename, self._path)
    for q, a, t in zip(questions, responses, topics):
        converter.addExample({
            'sourceUrl': url,
            'sourceName': "Johns Hopkins Bloomberg School of Public Health",
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": q,
            "answer": a,
            "hasAnswer": True,
            "targetEducationLevel": "College",
            "topic": [t],
            "extraData": {},
            "targetLocation": "",
            'language': 'en'
        })
    return converter.write()
def scrape(self):
    url = 'https://www.canada.ca/en/public-health/services/diseases/coronavirus-disease-covid-19.html#faq'
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml').find(
        'ul', {'class': 'list-unstyled'}).findAll('a')
    lastUpdatedTime = time.mktime(
        dateparser.parse(
            BeautifulSoup(html, 'lxml').find(
                'p', {'class': 'text-right h3 mrgn-tp-sm'})
            .getText()).timetuple())
    questions = [str(x) for x in soup]
    response_links = [x['href'] for x in soup]
    responses = list(map(self._link_to_responce, response_links))
    converter = Conversion(self._filename, self._path)
    for q, a in zip(questions, responses):
        if not a:  # no accompanying answer to question
            continue
        converter.addExample({
            'sourceUrl': url,
            'sourceName': "Public Health Agency of Canada",
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": q,
            "answer": a if a else "",
            "hasAnswer": a is not None,
            "targetEducationLevel": "NA",
            "topic": [],
            "extraData": {},
            "targetLocation": "Canada",
            "language": 'en',
        })
    return converter.write()
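# A minimal sketch of the link-follower the Canada scraper maps over its
# FAQ links. Assumptions: relative links resolve against canada.ca, and the
# answer lives in a <div class="mwsgeneric-base-html"> block on the linked
# page; the selector is a guess for illustration, not the repository's
# actual implementation.
def _link_to_responce(self, href):
    # Resolve relative links against the site root.
    base = 'https://www.canada.ca'
    full = href if href.startswith('http') else base + href
    try:
        page = BeautifulSoup(requests.get(full).text, 'lxml')
    except requests.RequestException:
        return None
    answer = page.find('div', {'class': 'mwsgeneric-base-html'})
    return str(answer) if answer else None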
def scrape(self):
    converter = Conversion(self._filename, self._path)
    # Put the code here that makes the examples
    for exampleNums in range(10):
        converter.addExample({
            'sourceUrl': 'example.com',
            'sourceName': "example",
            "needUpdate": True,
            "typeOfInfo": "QA",
            "isAnnotated": False,
            "responseAuthority": "",
            "question": '<a href="example.com/dir1">What is COVID-19?</a>',
            "answer": '<p><a href="example.com/dir2">Coronaviruses</a> are a large family of viruses.</p>',
            "hasAnswer": True,
            "targetEducationLevel": "NA",
            "topic": ['topic1', 'topic2'],
            "extraData": {'hello': 'goodbye'},
            "targetLocation": "US",
            "language": 'en',
        })
    # This write() will fail because the path doesn't exist
    return converter.write()
def test_init(self):
    converter = Conversion('test', '.')
    self.assertEqual(converter._file_prefix, 'test')
    self.assertEqual(converter._examples, [])
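# The tests in this file pin down the Conversion interface without showing
# the class itself. A minimal sketch of that interface, assuming only what
# the tests imply: missing schema keys raise KeyError, wrong value types or
# blank answers raise ValueError, and write() emits
# ./schema_v0.3/<prefix>_v0.3.jsonl. The ID-assignment, fuzzy ID
# preservation, and stale-example removal exercised by the tests below are
# deliberately elided; this is an illustration, not the real class.
import os
import jsonlines

class Conversion:
    REQUIRED_KEYS = {
        'sourceUrl', 'sourceName', 'needUpdate', 'typeOfInfo',
        'isAnnotated', 'responseAuthority', 'question', 'answer',
        'hasAnswer', 'targetEducationLevel', 'topic', 'extraData',
        'targetLocation', 'language',
    }

    def __init__(self, file_prefix, path):
        self._file_prefix = file_prefix
        self._path = path
        self._examples = []

    def addExample(self, example):
        missing = self.REQUIRED_KEYS - example.keys()
        if missing:
            raise KeyError('missing fields: {}'.format(sorted(missing)))
        if not isinstance(example['sourceUrl'], str):
            raise ValueError('sourceUrl must be a string')
        if example['hasAnswer'] and not example['answer'].strip():
            raise ValueError('blank answer marked hasAnswer')
        self._examples.append(example)

    def write(self):
        out = os.path.join(self._path, 'schema_v0.3',
                           '{}_v0.3.jsonl'.format(self._file_prefix))
        with jsonlines.open(out, mode='w') as writer:
            for ex in self._examples:
                writer.write(ex)  # real version also adds ID/dateLastChanged
        return True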
def test_remove_unseen(self):
    subprocess.run(
        ['touch', './schema_v0.3/test_remove_unseen_v0.3.jsonl'])
    converter = Conversion('test_remove_unseen', '.')
    converter.addExample({
        'sourceUrl': 'time.com',
        'sourceName': "time",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": 'Hello, my time should match my next line?',
        "answer": 'Hello this is the example response',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    converter.write()
    converter = Conversion('test_remove_unseen', '.')
    converter.addExample({
        'sourceUrl': 'time.com',
        'sourceName': "time",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": 'I am completely new?',
        "answer": 'I am unique! I am special! I matter!',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    converter.write()
    with open('./schema_v0.3/test_remove_unseen_v0.3.jsonl') as reader:
        self.assertEqual(len(reader.readlines()), 1)
def test_id_preservation_fuzzy_change(self):
    subprocess.run(
        ['touch', './schema_v0.3/test_id_preservation_v0.3.jsonl'])
    converter = Conversion('test_id_preservation', '.')
    converter.addExample({
        'sourceUrl': 'uuid.com',
        'sourceName': "uuid",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": 'Hello, this is the example question?',
        "answer": 'Hello this is the example response',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    converter.write()
    with jsonlines.open(
            './schema_v0.3/test_id_preservation_v0.3.jsonl') as reader:
        line = reader.read()
        original_id = line['ID']
    converter = Conversion('test_id_preservation', '.')
    converter.addExample({
        'sourceUrl': 'uuid.com',
        'sourceName': "uuid",
        "needUpdate": True,
        "typeOfInfo": "QA",
        "isAnnotated": False,
        "responseAuthority": "",
        "question": 'Hello, but this is the example question?',
        "answer": 'Hello this is the example response',
        "hasAnswer": True,
        "targetEducationLevel": "NA",
        "topic": ['topic1', 'topic2'],
        "extraData": {'hello': 'goodbye'},
        "targetLocation": "US",
        "language": 'en',
    })
    converter.write()
    with jsonlines.open(
            './schema_v0.3/test_id_preservation_v0.3.jsonl') as reader:
        line = reader.read()
        new_id = line['ID']
    self.assertEqual(original_id, new_id)
    subprocess.run(['rm', './schema_v0.3/test_id_preservation_v0.3.jsonl'])
def scrape(self):
    examples = self._crawl_common() + self._crawl_at_risk()
    converter = Conversion(self._filename, self._path)
    for example in examples:
        converter.addExample(example)
    return converter.write()