def value_for_label(html, label, text=True):
    """Return the ``td.content`` cell paired with the ``td.title`` matching *label*.

    With ``text=True`` (the default) the cell's text content is returned,
    otherwise the content element itself.
    """
    [title_cell] = select(html, 'td.title:contains("%s")' % label)
    [content_cell] = select(parent(title_cell, 'tr'), 'td.content')
    return content_cell.text_content() if text else content_cell
def test_search_lot_deliveries_filter_by_lot(self):
    """Filtering the lot search by lot id returns only matching rows."""
    self.new_parcel(stage='l-fih', delivery_type=LOT)
    for lot_id, expected_rows in (('lot3', 1), ('lot2', 0)):
        resp = self.client.get('/search/lot?lot=%s' % lot_id,
                               follow_redirects=True)
        self.assertEqual(200, resp.status_code)
        self.assertEqual(expected_rows,
                         len(select(resp.data, '.datatable tbody tr')))
def test_lot_page_displays_related_deliveries(self):
    """The lot page lists deliveries for its own lot and nothing else."""
    self.new_parcel(delivery_type=LOT)
    own_lot = self.LOT_METADATA['lot']
    own_page = self.client.get('/lot/' + own_lot)
    self.assertEqual(own_page.status_code, 200)
    self.assertEqual(1, len(select(own_page.data, "tbody > tr")))
    other_page = self.client.get('/lot/lot1')
    self.assertEqual(0, len(select(other_page.data, "tbody > tr")))
def test_country_workflow_overview_group_contain_correct_parcels(self):
    """Two identical country parcels become two rows in a single table."""
    payload = dict(self.PARCEL_METADATA)
    for _ in range(2):
        self.client.post('/parcel/new/country', data=payload)
    overview = self.client.get('/country/be')
    self.assertEqual(1, len(select(overview.data, '.datatable')))
    self.assertEqual(2, len(select(overview.data, '.datatable tbody tr')))
def test_send_email_page(self):
    """Participant page links to the mail form, pre-filled with the address."""
    participant_page = self.client.get('/meeting/1/participant/1')
    [ack_link] = select(participant_page.data,
                        'a:contains("Acknowledge email")')
    mail_url = '/meeting/1/participant/1/send_mail'
    self.assertEqual(ack_link.attrib['href'], mail_url)
    mail_page = self.client.get(mail_url)
    self.assertEqual(mail_page.status_code, 200)
    [to_field] = select(mail_page.data, 'input[name=to]')
    self.assertEqual(to_field.attrib['value'], "*****@*****.**")
def test_list_of_participants_columns(self):
    """Each flag column shows a check icon when the participant has the flag."""
    self._create_participant(u"10", default_data={
        "meeting_flags_credentials": True,
        "meeting_flags_approval": True,
        "meeting_flags_web_alert": True,
    })
    printout = self.client.get("/meeting/1/printouts/verified/short_list")
    for column in ("credentials", "approval", "webalert"):
        self.assertTrue(
            select(printout.data, "table .printout-%s .icon-check" % column))
def test_list_for_verification(self):
    """The verification list shows the participant's group and name."""
    self._create_participant("42", {"meeting_flags_attended": True})
    resp = self.client.get(
        "/meeting/1/printouts/attended/list_for_verification")
    [group_cell] = select(resp.data, ".group")
    self.assertEqual(group_cell.text_content(),
                     u"International Environmental Law Project")
    [name_cell] = select(resp.data, ".name")
    self.assertIn(u"smith joe", name_cell.text_content().lower())
def test_common_fields(self):
    """Credentials page shows the common labelled fields, alerts and phrases."""
    self._create_participant(u"10")  # 10: "Member"
    resp = self.client.get('/meeting/1/participant/1/credentials')
    expected_fields = [
        (u"Joe Smith", "Name and address"),
        (u"French", "Language"),
        (u"Not required", "Invitation received"),
        (u"No", "Web Alerts"),
    ]
    for fragment, label in expected_fields:
        self.assertIn(fragment, value_for_label(resp.data, label))
    [content] = select(resp.data, ".credentials-content")
    # the picture alert must be present
    self.assertTrue(select(content, ".alert"))
    # the credentials phrases must be on the page
    self.assertTrue(select(content, ".phrases-credentials"))
def test_badge(self):
    """The badge carries the blue stripe, the name, and the representation."""
    self._create_participant(u"10")
    resp = self.client.get("/meeting/1/participant/1/badge")
    self.assertTrue(select(resp.data, ".badge-blue-stripe"))
    [name_el] = select(resp.data, ".person-name")
    badge_name = name_el.text_content().lower()
    self.assertIn(u"joe", badge_name)
    self.assertIn(u"smith", badge_name)
    [rep_el] = select(resp.data, ".person-representing")
    self.assertIn(u"Europe", rep_el.text_content())
def test_qty(self):
    """Pigeon-hole printout collapses identical participants into one quantity."""
    for _ in range(2):
        self._create_participant(u"10")
    resp = self.client.get("/meeting/1/printouts/verified/pigeon_holes")
    [qty_cell] = select(resp.data, ".qty")
    self.assertEqual(qty_cell.text_content(), "2E")
def test_filter_parcel_empty(self):
    """A country+extent search with no matching parcels yields no rows."""
    with self.app.test_request_context():
        parcel = self.wh.new_parcel()
        parcel.metadata['country'] = 'ro'
    resp = self.client.get('/search/country?country=ro&extent=partial')
    self.assertEqual(0, len(select(resp.data, ".datatable tbody tr")))
def test_filter_parcel_empty(self):
    """Searching on country plus extent matches nothing for a bare parcel."""
    with self.app.test_request_context():
        fresh = self.wh.new_parcel()
        fresh.metadata['country'] = 'ro'
    result = self.client.get('/search?country=ro&extent=partial')
    matching_rows = select(result.data, ".datatable tbody tr")
    self.assertEqual(len(matching_rows), 0)
def test_reports_are_listed_on_lot_page(self):
    """Every uploaded report shows up in the lot page's report list."""
    for _ in range(2):
        payload = dict(self.REPORT_METADATA,
                       file=(StringIO('ze file'), 'doc.pdf'))
        self.client.post('/report/new', data=payload)
    lot_page = self.client.get('/lot/lot1')
    self.assertEqual(2, len(select(lot_page.data, '.report-list > li')))
def test_conference_staff(self):
    """Conference staff register as observers representing their category."""
    self._create_participant(u"98")  # 98: "Conference staff"
    resp = self.client.get('/meeting/1/participant/1/credentials')
    self.assertIn(u"Conference staff",
                  value_for_label(resp.data, "Category"))
    [subheader] = select(resp.data, ".subheader h3")
    self.assertIn(u"Observer", subheader.text_content())
    self.assertIn(schema.category["98"]["name"],
                  value_for_label(resp.data, "Representative of"))
def test_observer_international(self):
    """International NGO observers get fee, payment and approval phrases.

    Checks category, registration subheader, representation, invitation
    flag, and that each credentials phrase block renders exactly once
    (the single-element unpack fails if the selector matches 0 or >1).
    """
    self._create_participant(u"80")  # 80: "Observer, International NGO"
    resp = self.client.get('/meeting/1/participant/1/credentials')
    self.assertIn(u"Observer, International NGO",
                  value_for_label(resp.data, "Category"))
    [details_of_registration] = select(resp.data, ".subheader h3")
    details_of_registration = details_of_registration.text_content()
    self.assertIn(u"Observer", details_of_registration)
    self.assertIn(u"International Environmental Law Project",
                  value_for_label(resp.data, "Representative of"))
    self.assertIn(u"Yes", value_for_label(resp.data, "Invitation received"))
    # check that phrases.fee, phrases.payment and phrases.approval are present
    [credentials_content] = select(resp.data, ".credentials-content")
    [phrases_fee] = select(resp.data, ".phrases-fee")
    # bug fix: this unpack previously rebound `phrases_fee`, mislabelling
    # the payment-phrases check; the selector itself was already correct
    [phrases_payment] = select(resp.data, ".phrases-payment")
    [phrases_approval] = select(resp.data, ".phrases-approval")
def test_member(self):
    """Members are shown with region/country representation, no invitation."""
    self._create_participant(u"10")  # 10: "Member"
    resp = self.client.get('/meeting/1/participant/1/credentials')
    self.assertIn(u"Member", value_for_label(resp.data, "Category"))
    [subheader] = select(resp.data, ".subheader h3")
    self.assertIn(u"Member", subheader.text_content())
    representing = "%s - %s" % (schema.region["4"], schema.country["RO"])
    self.assertIn(representing,
                  value_for_label(resp.data, "Representative of"))
    self.assertIn(u"Not required",
                  value_for_label(resp.data, "Invitation received"))
def test_new_participant_submit(self):
    """Submitting the new-participant form creates person 1 and displays it."""
    post_resp = self.client.post('/meeting/1/participant/new', data={
        'personal_name_title': u"Mr",
        'personal_first_name': u"Joe",
        'personal_last_name': u"Smith",
    })
    redirect_path = urlparse(post_resp.location).path
    self.assertEqual(redirect_path, '/meeting/1/participant/1')
    view_resp = self.client.get(redirect_path)
    self.assertIn("Person information saved", view_resp.data)
    [first_name_th] = select(view_resp.data, 'tr th:contains("First name")')
    self.assertElementIn('td:contains("Joe")', first_name_th.getparent())
def test_country_workflow_overview_group(self):
    """Parcels for a second product are grouped under a single title."""
    payload = dict(self.PARCEL_METADATA)
    self.client.post('/parcel/new/country', data=payload)
    payload['product'] = 'grl'
    self.client.post('/parcel/new/country', data=payload)
    overview = self.client.get('/country/be')
    titles = select(overview.data, '.title')
    self.assertEqual(1, len(titles))
    self.assertIn('Grassland', [t.text.strip() for t in titles])
def test_country_workflow_overview_group(self):
    """Full and partial extents each get their own group header."""
    payload = dict(self.PARCEL_METADATA)
    self.client.post('/parcel/new', data=payload)
    payload['extent'] = 'partial'
    payload['coverage'] = 'Test coverage'
    self.client.post('/parcel/new', data=payload)
    overview = self.client.get('/country/be')
    headers = select(overview.data, '.title')
    self.assertEqual(2, len(headers))
    # compare with all whitespace squashed out of the header text
    squashed = [''.join(h.text.split()) for h in headers]
    self.assertIn('Belgium/European/20m/Full', squashed)
    self.assertIn('Belgium/European/20m/Partial', squashed)
def test_observer_party(self):
    """Party observers represent their country; no invitation required."""
    self._create_participant(u"30")  # 30: "Observer, Party"
    resp = self.client.get('/meeting/1/participant/1/credentials')
    self.assertIn(u"Observer, Party", value_for_label(resp.data, "Category"))
    [subheader] = select(resp.data, ".subheader h3")
    self.assertIn(u"Observer, Party", subheader.text_content())
    self.assertIn(schema.country["RO"],
                  value_for_label(resp.data, "Representative of"))
    self.assertIn(u"Not required",
                  value_for_label(resp.data, "Invitation received"))
def test_country_workflow_overview_group_contain_correct_parcels(self):
    """Each theme gets its own scope row on the country overview."""
    payload = dict(self.PARCEL_METADATA)
    self.client.post('/parcel/new', data=payload)
    payload['extent'] = 'partial'
    payload['theme'] = 'grd'
    payload['coverage'] = 'Test coverage'
    self.client.post('/parcel/new', data=payload)
    overview = self.client.get('/country/be')
    scope_rows = select(overview.data, '.scope-row')
    self.assertEqual(2, len(scope_rows))
    combined = ''.join(row.text.strip() for row in scope_rows)
    self.assertIn('Grassland Cover', combined)
    self.assertIn('Grassland Density', combined)
def test_filter_parcel(self):
    """Searching by country and extent returns only the matching parcel."""
    created_at = datetime.utcnow()
    with self.app.test_request_context():
        matching = self.wh.new_parcel()
        matching.add_history_item('create', created_at, 'tester', '')
        other = self.wh.new_parcel()
        other.add_history_item('create', created_at, 'tester', '')
        matching.metadata['country'] = 'ro'
        matching.metadata['extent'] = 'partial'
        other.metadata['country'] = 'at'
    resp = self.client.get('/search?country=ro&extent=partial')
    self.assertEqual(1, len(select(resp.data, ".datatable tbody tr")))
def test_filter_parcel(self):
    """The country search filters on both country and extent."""
    created_at = datetime.utcnow()
    with self.app.test_request_context():
        wanted = self.wh.new_parcel()
        wanted.add_history_item('create', created_at, 'tester', '')
        unwanted = self.wh.new_parcel()
        unwanted.add_history_item('create', created_at, 'tester', '')
        wanted.metadata['country'] = 'ro'
        wanted.metadata['extent'] = 'partial'
        unwanted.metadata['country'] = 'at'
    resp = self.client.get('/search/country?country=ro&extent=partial')
    self.assertEqual(1, len(select(resp.data, ".datatable tbody tr")))
def main(blog_name):
    """Crawl a cnblogs.com blog page by page and store new articles.

    Walks the blog's paginated index, extracts each article's URL/title,
    skips articles already in the DB, and inserts the rest into the
    ``cnblog_<blog_name>`` table. Python 2 code (print statements,
    ``except Exception, e``).
    """
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True  # loop flag; cleared when no "next page" link is found
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            # dump the raw page for debugging (overwritten every iteration)
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            # article links on this page (anchors with homepage1_*_TitleUrl_* ids)
            b2 = soup_2.find_all(
                'a', {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                # common.select presumably answers "already stored?" — skip if so
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    # NOTE(review): kword() overwrites the keyword from extract();
                    # only the content from extract() is kept
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)  # polite random delay between articles
            page += 1
            # look for a link to the next index page in the current page body
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False  # no next page: stop crawling
            else:
                url_1 = ppp.group()
                common.rand_sleep(7, 1)
        except Exception, e:
            # best-effort crawl: log the failure and keep looping
            print Exception, e
            logging.error('run error', exc_info=True)
def main():
    """Crawl mux.baidu.com's paginated article list and store new articles.

    Iterates the ``page_id=10`` listing, parses each ``artical_inner``
    block for URL/title/date/keyword, skips articles already stored, and
    inserts the rest. Python 2 code (print statements, ``except Exception, e``).
    """
    source = 'mux'
    page = 1
    flag = True  # loop flag; cleared when no "next page" link is found
    url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
    while flag:
        try:
            print url
            res = common.get_request(url)
            logging.info('return url {} success'.format(res.url))
            print res.url
            soup = BeautifulSoup(res.text, 'html.parser')
            # dump the raw page for debugging (overwritten every iteration)
            with open('temp.html', 'w+') as f:
                f.write(res.text.encode('utf8'))
            articles = soup.find_all('div', class_='artical_inner')
            for item in articles:
                # NOTE(review): relies on fixed child positions in the article
                # markup (contents[3] title, [5] date|keyword, [9] link) —
                # brittle against template changes
                contents = item.contents
                article_url = contents[9].a.get('href')
                article_title = str(contents[3].a.get('title')).strip()
                if not common.select(article_url, source):
                    # date is the last '|'-separated field, e.g. "2015年01月02日"
                    pub_time = time.strftime('%Y-%m-%d',\
                        time.strptime(str(contents[5].get_text()).split('|')[-1].strip(), '%Y年%m月%d日'))
                    # keyword is the second-to-last '|'-separated field
                    keyword = str(
                        contents[5].get_text()).split('|')[-2].strip()
                    content = get_content(
                        common.get_request(article_url).text)
                    print article_title
                    common.sql_insert(source, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)  # polite random delay per article
            page += 1
            # look for a link to the next listing page in the current body
            re_str = r'http://mux.baidu.com/\?page_id=10\S+paged={}'.format(
                page)
            pat = re.compile(re_str)
            s_r = re.search(pat, res.text)
            if s_r is None:
                flag = False  # no next page: stop crawling
            else:
                url = 'http://mux.baidu.com/?page_id=10&paged={}'.format(page)
                common.rand_sleep(7, 1)
        except Exception, e:
            # best-effort crawl: log the failure and keep looping
            print Exception, e
            logging.error('run error', exc_info=True)
def main(blog_name):
    """Crawl a cnblogs.com blog page by page and store new articles.

    Same flow as the sibling cnblogs crawler: walk the paginated index,
    extract article URL/title, skip known articles, insert the rest into
    the ``cnblog_<blog_name>`` table. Python 2 code.
    """
    sql_name = 'cnblog_' + blog_name
    page = 1
    flag = True  # loop flag; cleared when no "next page" link is found
    url_0 = "http://www.cnblogs.com/{}/".format(blog_name)
    url_1 = "http://www.cnblogs.com/{}/".format(blog_name)
    while flag:
        print url_1
        try:
            bb = common.get_request(url_1)
            logging.info('return url {} success '.format(bb.url))
            print bb.url
            soup_2 = BeautifulSoup(bb.text, 'html.parser')
            # dump the raw page for debugging (overwritten every iteration)
            with open('asdf.html', 'w+') as f:
                f.write(bb.text.encode('utf8'))
            # article links on this page (anchors with homepage1_*_TitleUrl_* ids)
            b2 = soup_2.find_all('a',
                                 {'id': re.compile('homepage1_\S+_TitleUrl_\S+?')})
            for i_text in b2:
                article_url = i_text.get('href')
                print article_url
                logging.info('article is {}'.format(article_url))
                article_title = i_text.get_text().strip()
                # skip articles already present in the DB
                if not common.select(article_url, blog_name):
                    article = common.get_request(article_url)
                    pub_time = common.re_time(article.text)
                    keyword, content = extract(article.text)
                    blog_id, blog_app, post_id = blog_info(article.text)
                    # NOTE(review): kword() overwrites the keyword from extract()
                    keyword = kword(blog_id, blog_app, post_id)
                    common.sql_insert(sql_name, article_url, article_title,
                                      content, pub_time, keyword)
                    common.rand_sleep(6, 1)  # polite random delay per article
            page += 1
            # look for a link to the next index page in the current page body
            re_str = url_0 + r'default\S+page={}'.format(page)
            print re_str
            pp = re.compile(re_str)
            ppp = re.search(pp, bb.text)
            if ppp is None:
                flag = False  # no next page: stop crawling
            else:
                url_1 = ppp.group()
                common.rand_sleep(7, 1)
        except Exception, e:
            # best-effort crawl: log the failure and keep looping
            print Exception, e
            logging.error('run error', exc_info=True)
def test_list_of_participants(self):
    """Short list shows only verified participants of registered categories."""
    self._create_participant(u"10")
    self._create_participant(u"1")
    resp = self.client.get("/meeting/1/printouts/verified/short_list")
    # condition: verified and cat > 9 and cat < 98 and cat["registered"] is True
    with self.app.test_request_context():
        person_row = database.get_person_or_404(1)
        category = schema.category[person_row["personal_category"]]
        self.assertTrue(category["registered"])
    with self.app.test_request_context():
        person_row = database.get_person_or_404(2)
        category = schema.category[person_row["personal_category"]]
        self.assertFalse(category["registered"])
    # only person 1 qualifies, so exactly one representing cell is rendered
    [representing] = select(resp.data, "table .printout-representing")
    representing = representing.text_content()
    self.assertIn(u"Europe", representing)
    self.assertIn(u"Romania", representing)
def input_data(root_path):
    """Walk *root_path* and register or refresh file records in the DB.

    Files smaller than 100 MB are skipped. Unknown files are hashed and
    inserted; known files are re-hashed and updated only when their size
    or date changed.
    """
    for fp in com.all_files(root_path, EXCL_LIST):
        cur = com.file_stat(fp)
        db = com.select(fp)
        if cur['size'] < 100000000:  # 100MB
            continue
        if db is None:
            h = com.hash(fp)
            if h is not None:
                # reuse the hash computed above instead of hashing the file twice
                com.input(cur['path'], cur['size'], cur['date'], h)
            print('등록', fp)
            continue
        # already registered with matching size and date: nothing to do
        if com.is_same_stat(cur, db):
            print('이미 등록', fp)
            continue
        print('갱신', fp)
        com.update(cur['path'], cur['size'], cur['date'], com.hash(fp))
def test_random_user_allowed_to_view_report(self):
    """The report list stays visible after ROLE_SP is emptied."""
    self.add_to_role("somebody", "ROLE_SP")
    self.try_new_report()
    self.app.config["ROLE_SP"] = []
    lot_page = self.client.get("/lot/lot1")
    self.assertEqual(1, len(select(lot_page.data, ".report-list")))
def test_delete_parcel_link_if_allow_parcel_deletion(self):
    """A deletable parcel page offers exactly one delete link."""
    name = self.create_parcel_at_stage('ver')
    page = self.client.get('/parcel/%s' % name)
    self.assertEqual(1, len(select(page.data, '.delete-parcel')))
def test_begin_parcel_displays_all_deliveries(self):
    """The new-parcel form offers all three delivery types."""
    form_page = self.client.get('/parcel/new/country')
    options = select(form_page.data, '[name=delivery_type]')
    self.assertEqual(len(options), 3)
def test_delete_parcel_link_if_not_allow_parcel_deletion(self):
    """With deletion disabled, the parcel page shows no delete link."""
    self.app.config['ALLOW_PARCEL_DELETION'] = False
    name = self.create_parcel_at_stage('ver')
    page = self.client.get('/parcel/%s' % name)
    self.assertEqual(0, len(select(page.data, '.delete-parcel')))
def test_search_country_deliveries(self):
    """A finished country delivery appears in the country search."""
    self.new_parcel(stage='l-fih', delivery_type=COUNTRY)
    result = self.client.get('/search/country')
    self.assertEqual(200, result.status_code)
    self.assertEqual(1, len(select(result.data, '.datatable tbody tr')))
if __name__ == '__main__':
    argv = common.parse_flags()
    # the three parallel-corpus files: foreign side, english side, alignments
    ffilename = FLAGS.parallel_corpus[0]
    efilename = FLAGS.parallel_corpus[1]
    afilename = FLAGS.parallel_corpus[2]
    ffile = open(ffilename)
    efile = open(efilename)
    afile = open(afilename)
    alignments = alignment.Alignment.reader_pharaoh(ffile, efile, afile)
    hgs = []  # one hypergraph per sentence pair
    rule_dumper = RuleDumper()
    for i, a in enumerate(timed(select(alignments)), 1):
        a.write_visual(logger.file)
        #if i != 8:
        #    continue
        #logger.writeln('--- %s ---' % i)
        #a.write_visual(logger.file)
        # build the phrase decomposition forest for this alignment
        hg, a = phrase_decomposition_forest(a)
        hgs.append(hg)
        # attach a rule to every hyperedge, built from the head/tail
        # phrase spans (fi, fj, ei, ej) and the sentence words
        for node in hg.topo_order():
            for edge in node.incoming:
                edge.rule = make_rule(
                    [edge.head.fi, edge.head.fj, edge.head.ei, edge.head.ej],
                    [[x.fi, x.fj, x.ei, x.ej] for x in edge.tail],
                    a.fwords,
                    a.ewords)
        #hg.show()
def main():
    """Extract translation rules from a parallel corpus with alignments.

    Reads the corpus files named by FLAGS.parallel_corpus, runs the rule
    Extractor over each aligned sentence pair, and streams the rules out
    through a RuleDumper, with optional hotshot profiling and periodic
    progress reporting on stderr.
    """
    import gc
    gc.set_threshold(100000, 10, 10)  # this makes a huge speed difference
    #gc.set_debug(gc.DEBUG_STATS)
    input_file = open(FLAGS.parallel_corpus[2])
    if FLAGS.hypergraph is not None:
        # output directory for hypergraphs; tolerate it already existing
        try:
            os.mkdir(FLAGS.hypergraph)
        except OSError:
            sys.stderr.write("warning: directory %s already exists\n" %
                             FLAGS.hypergraph)
    ffilename = FLAGS.parallel_corpus[0]
    efilename = FLAGS.parallel_corpus[1]
    ffile = open(ffilename)
    efile = open(efilename)
    if FLAGS.weightfiles is not None:
        fweightfile, eweightfile = FLAGS.weightfiles
    else:
        fweightfile = None
        eweightfile = None
    lexical_weighter = LexicalWeighter(fweightfile, eweightfile)
    maxlen = FLAGS.maxlen
    maxabslen = FLAGS.maxabslen
    tight_phrases = FLAGS.tight
    prev_time = start_time = time.time()
    # NOTE(review): `slice` shadows the builtin and is assigned twice below
    slice = 1000
    if profile:
        prof = hotshot.Profile("extractor.prof")
        prof.start()
    if logger.level >= 1:
        sys.stderr.write("(2) Extracting rules\n")
    count = 1      # input line counter (drives progress reporting)
    realcount = 0  # sentences actually processed
    slice = 1000
    if FLAGS.pharaoh:
        alignments = alignment.Alignment.reader_pharaoh(
            ffile, efile, input_file)
    else:
        alignments = alignment.Alignment.reader(input_file)
        # bug: ignores -W option
    rule_dumper = RuleDumper()
    for i, a in enumerate(select(alignments), 1):
        a.lineno = count
        if logger.level >= 2:
            a.write(logger.file)
            a.write_visual(logger.file)
        etree = None
        # done reading all input lines
        realcount += 1
        extractor = Extractor(maxabslen, maxlen, FLAGS.minhole,
                              FLAGS.maxvars, FLAGS.forbid_adjacent,
                              FLAGS.require_aligned_terminal,
                              tight_phrases, FLAGS.remove_overlaps,
                              lexical_weighter,
                              FLAGS.keep_word_alignments, etree,
                              FLAGS.etree_labels)
        rules = extractor.extract_rules(a)
        if logger.level >= 3:
            sys.stderr.write("Rules:\n")
            rules = list(rules)
            for r in rules:
                sys.stderr.write("%d ||| %s\n" % (realcount, r))
        if False:  # disabled debug dump of per-rule score ratios
            rules = list(rules)
            for r in rules:
                sys.stderr.write("%d ||| %s ||| %f %f\n" %
                                 (realcount - 1, r,
                                  r.scores[1] / r.scores[0],
                                  r.scores[2] / r.scores[0]))
        #logger.writeln('%s rules extracted from sent %s' % (len(rules), i))
        rule_dumper.add(rules)
        # periodic progress report every `slice` sentences
        if logger.level >= 1 and count % slice == 0:
            sys.stderr.write("time: %f, sentences in: %d (%.1f/sec), " %
                             (time.time() - start_time, count,
                              slice / (time.time() - prev_time)))
            sys.stderr.write("rules out: %d+%d\n" %
                             (rule_dumper.dumped, len(rule_dumper.gram)))
            prev_time = time.time()
        count += 1
    rule_dumper.dump()
    if profile:
        prof.stop()
        prof.close()
        stats = hotshot.stats.load("extractor.prof")
        stats.strip_dirs()
        stats.sort_stats('time', 'calls')
        stats.print_stats(100)
def test_delete_parcel_link_if_not_allow_parcel_deletion(self):
    """No delete link is rendered when deletion is turned off."""
    self.app.config['ALLOW_PARCEL_DELETION'] = False
    name = self.new_parcel(stage='c-fsc')
    page = self.client.get('/parcel/%s' % name)
    self.assertEqual(0, len(select(page.data, '.delete-parcel')))
def test_files_view(self):
    """An uploaded file is listed on the parcel files page."""
    parcel = self.create_parcel_at_stage()
    self.try_upload(parcel.name)
    listing = self.client.get('/parcel/%s/files' % parcel.name)
    self.assertEqual(1, len(select(listing.data, 'ul li')))
def test_delete_parcel_link_if_allow_parcel_deletion(self):
    """The delete link shows when parcel deletion is allowed."""
    name = self.new_parcel(stage='c-fsc')
    page = self.client.get('/parcel/%s' % name)
    self.assertEqual(1, len(select(page.data, '.delete-parcel')))
def test_reupload_file_not_allowed(self):
    """Uploading the same file twice produces a system message."""
    name = self.new_parcel()
    self.try_upload_file(name)
    second_attempt = self.try_upload_file(name)
    self.assertEqual(1, len(select(second_attempt.data, '.system-msg')))
def test_files_view(self):
    """The files page lists the single uploaded file."""
    name = self.new_parcel()
    self.try_upload(name)
    listing = self.client.get('/parcel/%s/files' % name)
    self.assertEqual(1, len(select(listing.data, 'ul li')))
def test_lot_page_displays_correct_number_of_stages(self):
    """The lot table's last header row has one column per stage (six)."""
    self.new_parcel(delivery_type=LOT)
    page = self.client.get('/lot/' + self.LOT_METADATA['lot'])
    header_cells = select(page.data, "thead > tr:last-child > th")
    self.assertEqual(6, len(header_cells))
def test_reupload_file_not_allowed(self):
    """A second upload of the same file is rejected with a message."""
    parcel = self.create_parcel_at_stage()
    self.try_upload_file(parcel.name)
    retry = self.try_upload_file(parcel.name)
    self.assertEqual(1, len(select(retry.data, '.system-msg')))