def parse_page(parser,pagenr): page=parser.parse_page_to_items(pagenr) out=[] for areaname,coords,meta in find_areas(page): print "Found area:",areaname if areaname.count("CTA") or areaname.count("FIR") or areaname.count("REF:ENR 2.2-3") or areaname.count("OCA"): continue assert areaname.count("TMA") lines=[x for x in page.get_lines(page.get_partially_in_rect(0,meta['y2']+0.5,100,meta['y2']+10)) if x.strip()] alts=[] for line in lines[:15]: print "Alt-parsing:",line m=re.match(ur"(FL \d+).*",line) if m: alts.append(m.groups()[0]) m=re.match(ur"(\d+ FT AMSL).*",line) if m: alts.append(m.groups()[0]) if len(alts)==2: break ceiling,floor=alts identh,=page.get_by_regex(ur"IDENT") freqh,=page.get_by_regex(ur"FREQ") callsign= " ".join(page.get_lines(page.get_partially_in_rect(identh.x1,meta['y1']+0.25,freqh.x1-2.0,meta['y2']))) freqlines=" ".join(page.get_lines(page.get_partially_in_rect(freqh.x1,meta['y1'],freqh.x2,meta['y2']))) def wanted_freq(x): if abs(x-121.5)<1e-6: return False if x>150.0: return False return True freqs=[(callsign,float(x)) for x in re.findall(ur"\d{3}\.\d{3}",freqlines) if wanted_freq(float(x))]
def parse_page(parser, pagenr):
    """Parse one AIP ENR page: extract TMA areas with their vertical
    limits (ceiling/floor), controller callsign and radio frequencies.
    CTA/FIR/OCA areas and the ENR 2.2-3 reference block are skipped.
    """
    page = parser.parse_page_to_items(pagenr)
    out = []
    for areaname, coords, meta in find_areas(page):
        print "Found area:", areaname
        # .count() is used as a truthy membership test here.
        if areaname.count("CTA") or areaname.count("FIR") or areaname.count(
                "REF:ENR 2.2-3") or areaname.count("OCA"):
            continue
        assert areaname.count("TMA")
        # Non-blank lines just below the area heading carry the limits.
        lines = [
            x for x in page.get_lines(
                page.get_partially_in_rect(0, meta['y2'] + 0.5, 100,
                                           meta['y2'] + 10)) if x.strip()
        ]
        alts = []
        for line in lines[:15]:
            print "Alt-parsing:", line
            # Limits are flight levels ("FL 95") or feet AMSL.
            m = re.match(ur"(FL \d+).*", line)
            if m:
                alts.append(m.groups()[0])
            m = re.match(ur"(\d+ FT AMSL).*", line)
            if m:
                alts.append(m.groups()[0])
            if len(alts) == 2:
                break
        # First altitude found is the ceiling, second is the floor.
        ceiling, floor = alts
        # Column headers locate the callsign and frequency columns.
        identh, = page.get_by_regex(ur"IDENT")
        freqh, = page.get_by_regex(ur"FREQ")
        callsign = " ".join(
            page.get_lines(
                page.get_partially_in_rect(identh.x1, meta['y1'] + 0.25,
                                           freqh.x1 - 2.0, meta['y2'])))
        freqlines = " ".join(
            page.get_lines(
                page.get_partially_in_rect(freqh.x1, meta['y1'], freqh.x2,
                                           meta['y2'])))

        def wanted_freq(x):
            # Reject the 121.5 emergency frequency and anything above
            # the airband (> 150 MHz).
            if abs(x - 121.5) < 1e-6:
                return False
            if x > 150.0:
                return False
            return True

        # Pair every plausible-looking frequency with the area callsign.
        freqs = [(callsign, float(x))
                 for x in re.findall(ur"\d{3}\.\d{3}", freqlines)
                 if wanted_freq(float(x))]
def extract_single_sup(full_url, sup, supname, opening_ours):
    """Extract area polygons (and note hours-of-operation pages) from a
    single AIP SUP document.

    full_url     -- URL of the SUP, stored with each extracted area
    sup          -- document handle accepted by Parser
    supname      -- human-readable SUP name used in area labels
    opening_ours -- set; URLs of SUPs containing hours-of-operation
                    sections are added to it (mutated in place)
    """
    #print getxml("/AIP/AD/AD 1/ES_AD_1_1_en.pdf")
    ads = []
    try:
        p = Parser(sup)
    except Exception:
        print "Could't parse", sup
        #Some AIP SUP's contain invalid XML after conversion from PDF.
        #skip these for now
        return []
    areas = []
    startpage = None
    for pagenr in xrange(p.get_num_pages()):
        page = p.parse_page_to_items(pagenr)
        #print page.get_all_items()
        for item in page.get_by_regex(".*HOURS OF OPERATION.*"):
            # Look a couple of units above/below the match for the full
            # "SUP nn/yyyy HOURS OF OPERATION" heading line.
            lines = page.get_lines(
                page.get_partially_in_rect(0, item.y1 - 2, 100, item.y2 + 2))
            found = False
            for line in lines:
                if re.match(ur".*SUP\s*\d+/\d{4}\.?\s+HOURS OF OPERATION\s*$",
                            line):
                    opening_ours.add(p.get_url())
                    print "Found hours:", opening_ours
        # NOTE(review): try-block placement reconstructed from flattened
        # source — assumed to run once per page; confirm against original.
        try:
            for areaname, coords, meta in find_areas(page):
                if areaname:
                    name = "%s (on page %d of %s)" % (areaname, pagenr + 1,
                                                      supname)
                else:
                    name = "Area on page %d of %s" % (pagenr + 1, supname)
                print "Number of points", len(coords)
                areas.append(
                    dict(url=full_url,
                         pagenr=pagenr + 1,
                         sup=supname,
                         name=name,
                         type='aip_sup',
                         points=coords))
        except Exception:
            # Best-effort: a failed page is silently skipped.
            pass
def extract_single_sup(full_url,sup,supname,opening_ours): #print getxml("/AIP/AD/AD 1/ES_AD_1_1_en.pdf") ads=[] try: p=Parser(sup) except Exception: print "Could't parse",sup #Some AIP SUP's contain invalid XML after conversion from PDF. #skip these for now return [] areas=[] startpage=None for pagenr in xrange(p.get_num_pages()): page=p.parse_page_to_items(pagenr) #print page.get_all_items() for item in page.get_by_regex(".*HOURS OF OPERATION.*"): lines=page.get_lines(page.get_partially_in_rect(0,item.y1-2,100,item.y2+2)) found=False for line in lines: if re.match(ur".*SUP\s*\d+/\d{4}\.?\s+HOURS OF OPERATION\s*$",line): opening_ours.add(p.get_url()) print "Found hours:",opening_ours try: for areaname,coords,meta in find_areas(page): if areaname: name="%s (on page %d of %s)"%(areaname,pagenr+1,supname) else: name="Area on page %d of %s"%(pagenr+1,supname) print "Number of points",len(coords) areas.append(dict( url=full_url, pagenr=pagenr+1, sup=supname, name=name, type='aip_sup', points=coords)) except Exception: pass