def query_instrument_fields(args):
    """Run an instrument field search against Elasticsearch.

    args: request-parameter mapping, handed to common() to build the
    doc_type, search_type and ES query body.
    Returns a dict with the raw ES hits under 'search_results'.
    Raises CustomException on any failure.
    """
    try:
        doc_type, search_type, body = common(args)
        es = current_app.extensions['elasticsearch']
        results = es.search(
            index="legislation",
            doc_type=doc_type,
            body=body)
        return {
            'type': 'search',
            'search_type': search_type,
            'search_results': results['hits']
        }
    except Exception as e:
        # Broad catch is deliberate: any ES/parsing failure becomes a
        # single user-facing error.
        print(e)
        raise CustomException('There was a problem with your query')
def get_definition_route(ids, exids=None):
    """Route handler: look up definitions by semicolon-separated ids.

    ids: 'id1;id2;...' string of definition ids to fetch.
    exids: optional 'id1;id2;...' string of ids to exclude.
    Returns a JSON response; raises CustomException on any failure.
    """
    try:
        exclude = exids.split(';') if exids else None
        return jsonify(get_definition(ids.split(';'), exclude))
    except Exception:
        raise CustomException('Could not retrieve definition')
def instrument_location(instrument, location, args):
    """Resolve a textual location inside an instrument and render it.

    The location string is tried as-is first; if that fails (or matches
    only the whole document, which means it was not specific enough),
    the canonicalised form of the link text is tried instead.
    Returns a fragment-format response dict for the matched nodes.
    """
    root = instrument.get_tree()
    try:
        nodes = nodes_from_path_string(root, location)
        # A single hit equal to the root means nothing specific matched;
        # force the canonical-link fallback below.
        if len(nodes) == 1 and nodes[0] == root:
            raise CustomException('try again')
    except CustomException:
        nodes = nodes_from_path_string(root, link_to_canonical(location))
    full_location, _, path = generate_path_string(nodes[0])
    culled = cull_tree(nodes)
    return {
        'html_content': etree.tostring(
            tohtml(culled), encoding='UTF-8', method="html"),
        'title': instrument.title,
        'full_title': full_location,
        'document_id': instrument.id,
        'doc_type': 'instrument',
        "latest": instrument.attributes['latest'],
        "path": instrument.attributes['path'],
        "date_as_at_str": format_govt_date(
            instrument.attributes['date_as_at']),
        'format': 'fragment',
        'query': {
            'doc_type': 'instrument',
            'document_id': instrument.id,
            'find': 'location',
            'location': path
        }
    }
def get_act_summary(doc_id, db=None):
    """Fetch the full instruments row for doc_id as a dict.

    db: optional open connection; defaults to the request-scoped one.
    Raises CustomException if no instrument matches.
    """
    with (db or get_db()).cursor(cursor_factory=extras.RealDictCursor) as cur:
        query = """select * from instruments where id = %(doc_id)s """
        cur.execute(query, {'doc_id': doc_id})
        result = cur.fetchone()
        # fetchone() returns None (it does not raise) when no row matches,
        # so the original bare try/except could never fire; check explicitly.
        if result is None:
            raise CustomException("Instrument not found")
        return result
def find_definition(tree, query):
    """Find the def-para whose def-term best matches query.

    Candidate def-term nodes containing the query text are ranked by
    Levenshtein distance to the query; the closest match's enclosing
    def-para ancestor is returned.
    Raises CustomException when nothing matches.
    """
    try:
        nodes = tree.xpath(
            ".//def-para/para/text/def-term[contains(.,'%s')]" % query)
        ranked = sorted(
            ((node, levenshtein(query, node.text)) for node in nodes),
            key=itemgetter(1))
        best = ranked[0][0]
        # next() builtin instead of the Python-2-only .next() method.
        return next(best.iterancestors(tag='def-para'))
    except Exception:
        raise CustomException("Path for definition not found")
def find_document_id_by_govt_id(node_id, db=None):
    """Map a government node id to our internal latest-instrument id.

    db: optional open connection; defaults to the request-scoped one.
    Raises CustomException when the id is unknown.
    """
    with (db or get_db()).cursor() as cur:
        try:
            query = """
                select a.id from latest_instruments a
                join id_lookup i on i.parent_id = a.id
                where i.govt_id = %(node_id)s """
            cur.execute(query, {'node_id': node_id})
            # fetchone() is None when nothing matches; indexing None raises
            # TypeError, translated here to a domain error.
            return cur.fetchone()[0]
        except Exception:
            raise CustomException("Result not found")
def get_act_summary_govt_id(govt_id, db=None):
    """Fetch the instrument row matching a government id, as a dict.

    db: optional open connection; defaults to the request-scoped one.
    Raises CustomException if no instrument matches.
    """
    with (db or get_db()).cursor(cursor_factory=extras.RealDictCursor) as cur:
        query = """select * from id_lookup d
            join instruments i on parent_id = id
            where d.govt_id = %(govt_id)s"""
        cur.execute(query, {'govt_id': govt_id})
        result = cur.fetchone()
        # fetchone() returns None rather than raising on an empty result,
        # so the original bare except could never trigger; check explicitly.
        if result is None:
            raise CustomException("Instrument not found")
        return result
def get_act_exact(title=None, doc_id=None, db=None):
    """Fetch a latest instrument by exact title and/or id.

    title, doc_id: either (or both) may be given; NULL-safe SQL handles
    the omitted one.
    Returns (parsed lxml document, instrument id).
    Raises CustomException when no instrument matches or parsing fails.
    """
    with (db or get_db()).cursor() as cur:
        query = """
            select document, id from latest_instruments
            where (%(title)s is null or title = %(title)s)
            and (%(id)s is null or id = %(id)s) """
        cur.execute(query, {'title': title, 'id': doc_id})
        result = cur.fetchone()
        # fetchone() returns None on no match; fail before indexing it.
        if result is None:
            raise CustomException("Instrument not found")
        try:
            return etree.fromstring(result[0], parser=large_parser), result[1]
        except Exception:
            # Document stored but unparseable — surface the same error.
            raise CustomException("Instrument not found")
def query_instrument(args):
    """Dispatch an instrument query based on the 'find' parameter.

    Resolves the target instrument by id, government id or title, then
    delegates to the matching finder (contains / references / versions /
    more / preview / location / govt_location); defaults to returning
    the full instrument.
    Raises CustomException when no instrument or location is specified.
    """
    find = args.get('find')
    # These finders do their own instrument resolution.
    if find == 'contains':
        return query_contains(args)
    if find == 'section_references':
        return section_references(args)
    if find == 'section_versions':
        return section_versions(args)
    if find == 'more':
        return instrument_more(args.get('document_id'),
                               args.get('parts').split(','), args)
    govt_location = args.get('govt_location')
    if args.get('id', args.get('document_id')):
        doc_id = args.get('id', args.get('document_id'))
        # A non-numeric id is treated as a government id and mapped to an
        # internal document id first.
        if isinstance(doc_id, basestring) and not doc_id.isdigit():
            govt_id = doc_id
            doc_id = find_document_id_by_govt_id(doc_id)
            instrument = get_instrument_object(doc_id)
            # If the govt id names a node inside the instrument rather than
            # the instrument itself, switch to a location lookup on it.
            if instrument.attributes['govt_id'] != govt_id:
                find = 'govt_location'
                govt_location = govt_id
        else:
            instrument = get_instrument_object(doc_id)
    elif args.get('title'):
        instrument = get_latest_instrument_object(args.get('title'))
    else:
        raise CustomException('No instrument specified')
    if find == 'preview':
        return instrument_preview(instrument)
    elif find == 'location':
        # NOTE(review): with find == 'location' but no 'location' arg this
        # deliberately(?) falls through to the full instrument — confirm.
        if args.get('location'):
            return instrument_location(instrument, args.get('location'), args)
    elif find == 'govt_location':
        if not govt_location:
            raise CustomException('No location specified')
        return instrument_govt_location(instrument, govt_location,
                                        args.get('link_text'), args)
    """ default is full instrument """
    return instrument_full(instrument, args)
def query_contains(args):
    """Full-text search within a single instrument's parts.

    Searches the 'html' field of the instrument's part documents,
    optionally restricted to specific part ids via args['parts'].
    Returns the raw ES hits plus display metadata.
    Raises CustomException on any failure.
    """
    try:
        es = current_app.extensions['elasticsearch']
        offset = args.get('offset')
        doc_id = args.get('id', args.get('document_id'))
        query_filter = {"term": {"_parent": doc_id}}
        if args.get('parts'):
            # Part documents are ided as '<doc_id>-<part>'.
            parts = ['%s-%s' % (doc_id, p)
                     for p in args.get('parts').split(',')]
            query_filter = {
                'bool': {
                    "should": query_filter,
                    "must": {"ids": {'values': parts}}
                }
            }
        body = {
            "fields": ['title'],
            "sort": ['num'],
            "query": contains_query(args, 'html'),
            "filter": query_filter,
            "highlight": {
                "pre_tags": ["<span class='search_match'>"],
                "post_tags": ["</span>"],
                "fields": {'html': {"number_of_fragments": 0}}
            }
        }
        if args.get('parts'):
            # Named parts: return everything in one page, and highlight
            # even when the match is in another field.
            body['highlight']['require_field_match'] = False
            body["size"] = 10000
        else:
            body["size"] = 25
            body["from"] = offset
        results = es.search(index="legislation", doc_type='part', body=body)
        return {
            'type': 'search',
            'search_type': 'contains_result',
            'search_results': results['hits'],
            'title': 'Advanced Search'
        }
    except Exception as e:
        print(e)
        raise CustomException('There was a problem with your query')
def get_instrument_object(id=None, db=None, replace=False):
    """Load an instrument by id, preferring the pre-processed version.

    Falls back to processing the raw (unprocessed) instrument when no
    processed row exists. replace=True forces reprocessing.
    Raises CustomException when the document does not exist.
    """
    try:
        db = db or get_db()
        with db.cursor(cursor_factory=extras.RealDictCursor) as cur:
            query = """SELECT * from get_processed_instrument(%(id)s) """
            cur.execute(query, {'id': id})
            result = cur.fetchone()
            if result:
                return prep_instrument(result, replace, db)
            # BUGFIX: the fallback previously called prep_instrument with a
            # single argument, but prep_instrument requires (result,
            # replace, db) — the resulting TypeError was swallowed below as
            # 'Document does not exist'. Pass the remaining args through.
            return prep_instrument(
                get_unprocessed_instrument(id, db), replace, db)
    except TypeError as e:
        print(e)
        raise CustomException('Document does not exist')
def query_contains_skeleton(args):
    """Search an instrument's skeleton and report matching part numbers.

    Returns a dict with the highlighted skeleton html (when the skeleton
    itself matches) and the numerically sorted list of part numbers
    whose html matches the query.
    Raises CustomException on any failure.
    """
    try:
        es = current_app.extensions['elasticsearch']
        doc_id = args.get('id', args.get('document_id'))
        results = {}
        body = {
            "fields": [],
            "query": contains_query(args, 'skeleton'),
            "filter": {"term": {"_id": doc_id}},
            "highlight": {
                "pre_tags": ["<span class='search_match'>"],
                "post_tags": ["</span>"],
                "fields": {'skeleton': {"number_of_fragments": 0}},
                "require_field_match": False
            }
        }
        es_results = es.search(index="legislation", doc_type='instrument',
                               body=body)
        try:
            results['html_content'] = es_results['hits']['hits'][0][
                'highlight']['skeleton'][0]
        except IndexError:
            # No skeleton-level match; still report part matches below.
            pass
        body = {
            "fields": [],
            "size": 10000,
            "query": contains_query(args, 'html'),
            "filter": {"term": {"_parent": doc_id}}
        }
        es_results = es.search(index="legislation", doc_type='part',
                               body=body)
        # Part ids look like '<doc_id>-<part_number>'; sort numerically.
        results['part_matches'] = sorted(
            (hit['_id'].split('-', 1)[1]
             for hit in es_results['hits']['hits']),
            key=lambda x: int(x))
        return results
    except Exception as e:
        print(e)
        raise CustomException('There was a problem with your query')
def query_case_fields(args):
    """Search cases by citation, contents and/or year.

    Builds a bool 'must' query from whichever filters are present in
    args, highlighting the searched fields.
    Raises CustomException on any failure.
    """
    must = []
    fields = {}
    try:
        if args.get('full_citation'):
            must.append({
                "simple_query_string": {
                    "query": args.get('full_citation'),
                    "fields": ['full_citation'],
                    "default_operator": 'AND'
                }
            })
        if args.get('contains'):
            fields['document'] = {}
            must.append(contains_query(args))
        if args.get('year'):
            must.append(year_query(args))
        es = current_app.extensions['elasticsearch']
        offset = args.get('offset', 0)
        results = es.search(
            index="legislation",
            doc_type="case",
            body={
                "from": offset,
                "size": 25,
                "fields": ["id", "full_citation"],
                "sort": ["_score"],
                "query": {"bool": {"must": must}},
                "highlight": {
                    "pre_tags": ["<span class='search_match'>"],
                    "post_tags": ["</span>"],
                    "fields": fields
                }
            })
        return {
            'type': 'search',
            'search_results': results['hits'],
            'title': 'Advanced Search'
        }
    except Exception as e:
        print(e)
        raise CustomException('There was a problem with your query')
def query():
    """Top-level search endpoint: route the request to a handler chosen
    by its 'doc_type' parameter and return the JSON-encoded result."""
    args = request.args
    query_type = args.get('doc_type')
    if query_type == 'all':
        handler = query_all
    elif query_type in ('act', 'regulation', 'sop', 'bill', 'instrument'):
        handler = query_instrument
    elif query_type == 'instruments':
        handler = query_instrument_fields
    elif query_type == 'case':
        handler = query_case
    elif query_type == 'cases':
        handler = query_case_fields
    else:
        raise CustomException('Badly formed query')
    return jsonify(handler(args))
def prep_instrument(result, replace, db):
    """Build an Instrument object from a processed-instrument DB row.

    result: dict row from get_processed_instrument (falsy -> error).
    replace: when True, force re-processing of the raw document.
    db: open database connection used for any re-processing.
    Raises CustomException when result is falsy.

    The processed document, skeleton and heights are cached on the row;
    each is regenerated only when missing or when replace forces it.
    """
    if not result:
        raise CustomException('Instrument not found')
    tree = None
    definitions = None  # NOTE(review): assigned but unused here — confirm.
    redo_skele = False
    if replace or not result.get('processed_document'):
        # No cached document (or forced refresh): re-run the processing
        # pipeline from the raw instrument row.
        tree, definitions = process_instrument(
            row=get_unprocessed_instrument(result.get('id'), db=db),
            db=db,
            latest=result.get('latest'),
            refresh=replace)
        document = etree.tostring(tree, encoding='UTF-8', method="html")
        # A freshly built document invalidates cached skeleton/heights.
        redo_skele = True
    else:
        document = result.get('processed_document')
    if redo_skele or not result.get('skeleton'):
        # Reuse the in-memory tree when we just built it; otherwise parse
        # the stored document.
        skeleton = process_skeleton(
            result.get('id'),
            tree if tree is not None else etree.fromstring(
                document, parser=large_parser),
            db=db,
            version=result.get('version'))
    else:
        skeleton = result.get('skeleton')
    if redo_skele or not result.get('heights'):
        heights = process_heights(
            result.get('id'),
            tree if tree is not None else etree.fromstring(
                document, parser=large_parser),
            db=db,
            version=result.get('version'))
    else:
        heights = result.get('heights')
    return Instrument(id=result.get('id'),
                      document=document,
                      skeleton=skeleton,
                      heights=heights,
                      title=result.get('title'),
                      attributes=result)
def prep_case(result, replace, db):
    """Build a Case object from a DB row, processing the document when
    required.

    result: dict row for the case (must carry an 'id').
    replace: when True, force re-processing of the raw document.
    db: open database connection used for processing.
    Raises CustomException when the row is missing or has no id.
    """
    if not result or not result.get('id'):
        raise CustomException('Case not found')
    if replace or not result.get('processed_document'):
        tree = process_case(row=result, db=db)
    else:
        tree = html.fromstring(result.get('processed_document'))
    # Contents/skeleton processing is currently disabled; an empty string
    # keeps the Case interface stable.
    return Case(
        id=result.get('id'),
        tree=tree,
        contents='',
        attributes=dict(result))
def query_acts(args):
    """Placeholder handler: act-collection queries are not yet supported."""
    raise CustomException('Not Implemented')
def find_definitions(tree, query):
    """Return every def-para whose def-term contains the query text.

    Raises CustomException when no definition matches.
    """
    nodes = tree.xpath(
        ".//def-para[descendant::def-term[contains(.,'%s')]]" % query)
    # Idiomatic emptiness test instead of `not len(nodes)`.
    if not nodes:
        raise CustomException("Path for definition not found")
    return nodes
def case_skeleton_response(case):
    """Serialise a Case into the standard full-document response dict."""
    rendered = etree.tostring(case.tree, encoding='UTF-8', method="html")
    return {
        'html_content': rendered,
        'html_contents_page': case.contents,
        'title': case.title,
        'full_title': case.title,
        'document_id': case.id,
        'doc_type': 'case',
        'attributes': case.attributes,
        'format': 'full',
        'parts': {},
        'query': {
            'doc_type': 'case',
            'document_id': case.id,
            'find': 'full'
        }
    }


def query_case(args):
    """Look up a single case by id and return its rendered response.

    Raises CustomException when no id is supplied.
    """
    if args.get('id'):
        case = get_case_object(document_id=args.get('id'))
        return case_skeleton_response(case)
    raise CustomException('Invalid search type')
def get_link_route(doc_type=None, key=None):
    """Resolve preview link data for a document reference.

    Only instrument links (or an unspecified doc_type) are supported.
    Raises CustomException for any other doc_type.
    """
    # Guard clause: reject anything that is not an instrument lookup.
    if doc_type is not None and doc_type != 'instrument':
        raise CustomException("Can't locate link information")
    return jsonify(query_instrument({'find': 'preview', 'id': key}))
def find_node_by_query(tree, query):
    """Return all nodes whose text contains the query string.

    Raises CustomException when the XPath cannot be evaluated (e.g. when
    the query contains a quote character that breaks the expression).
    """
    try:
        return tree.xpath(".//*[contains(.,'%s')]" % query)
    except Exception:
        raise CustomException("Path not found")
def find_sub_node(tree, keys, limit_tags=['part', 'subpart'], force_tag=None):
    """Depth-first search down the tree matching labels in keys.

    keys: list of label strings; each may combine labels with '+' and
    express inclusive ranges with '-' (e.g. '2+4', '3-5').
    limit_tags exists to prevent ambiguity between parts and section
    labels; however, sometimes parts etc must be treated like sections
    (for ranges etc), hence force_tag overrides the wildcard tag.
    Returns the matched nodes with any of their own ancestors removed.
    Raises CustomException when no matching path is found.

    NOTE(review): the mutable default for limit_tags is shared between
    calls but never mutated here, so it is safe in practice.
    """
    node = tree
    xpath_query = ".//%s[%s]"
    # Depth = number of ancestors; used to pick the shallowest candidate.
    depth = lambda x: len(list(x.iterancestors()))
    # NOTE(review): 'len(node) == 1' reads the enclosing 'node' variable,
    # not the 'nodes' parameter — possibly a latent bug; confirm intent.
    shallowest = lambda nodes: nodes[0] if len(node) == 1 else sorted(
        map(lambda x: (x, depth(x)), nodes), key=itemgetter(1))[0][0]

    def get_closest(node, label):
        """Find the shallowest node matching label, widening to ancestors.

        note: this is split between xpath and python for performance
        reasons (xpath too slow on ancestors)
        """
        while True:
            try:
                tag = force_tag if force_tag else '*'
                nodes = node.xpath(xpath_query % (tag, labelize(label)))
                # Drop limited tags and nodes inside ignored containers.
                nodes = filter(
                    lambda x: x.tag not in limit_tags and not len(
                        set(map(lambda t: t.tag, x.iterancestors())).
                        intersection(IGNORE_TRAVERSAL_TAGS)), nodes)
                return shallowest(nodes)
            except IndexError:
                # Nothing matched here: retry from the parent scope.
                node = node.getparent()
                if node is None or not len(node):
                    raise StopIteration('no more parents')

    nodes = []
    try:
        for i, a in enumerate(keys):
            if a:
                adds = a.split('+')
                for add in adds:
                    add = add.strip()
                    if not add:
                        continue
                    elif '-' in add:
                        # we can't assume any reasonable lexicographical
                        # ordering of labels, so instead find first match
                        # and continue until last
                        labels = [x.strip() for x in add.split('-')]
                        # get first node
                        start = get_closest(node, labels[0])
                        last = get_closest(node, labels[1])
                        # this sucks, having to start at start,
                        tag = start.tag
                        nodes.append(start)
                        # try to find way to start iter at arbitrary node
                        tree_iter = tree.iter(tag)
                        current = None
                        # Advance the document iterator up to 'start'...
                        while True:
                            current = next(tree_iter)
                            if current == start:
                                break
                        # ...then collect every same-tag node until we
                        # match 'last' (inclusive).
                        while True:
                            current = next(tree_iter)
                            nodes.append(current)
                            if current == last:
                                break
                    else:
                        nodes.append(get_closest(node, add.strip()))
                node = nodes
            if i < len(keys) - 1:
                # Continue the next key's search from the latest match.
                node = nodes[-1]
        if not len(keys):
            nodes = [node]
        if not len(nodes):
            raise CustomException("Empty")
        # remove ancestors: keep only the deepest matched nodes
        ancestors = []
        for n in nodes:
            ancestors.extend(list(n.iterancestors()))
        nodes = [n for n in nodes if n not in ancestors]
        return nodes
    except (IndexError, StopIteration, AttributeError), e:
        raise CustomException("Path not found")
def find_node_by_govt_id(tree, query):
    """Return all nodes whose id attribute equals the given government id.

    Raises CustomException when the XPath cannot be evaluated.
    """
    try:
        return tree.xpath(".//*[@id='%s']" % query)
    except Exception:
        raise CustomException("Path not found")
def nodes_from_path_string(tree, path):
    """Resolve a textual path (e.g. 's 5', 'sch 2, cl 3') to tree nodes.

    If the path starts with a container keyword (schedule/part/sch/...),
    descend into that container and recurse on the remainder; otherwise
    split the remainder into label keys for a sub-node search.
    Raises CustomException when the path cannot be resolved.

    NOTE(review): in this excerpt the non-container branch builds 'keys'
    but no return of a result is visible — a trailing call (presumably
    find_sub_node(tree, keys)) may lie outside this view; confirm
    against the full file.
    """
    path = path.lower().strip()
    pattern = re.compile(
        '(schedule|section|regulation|clause|rule|subpart|part|sch|pt|reg|rr|hcr|ss|s|r|cl)[, ]{,2}(.*)'
    )
    part_pattern = re.compile(
        '([a-z\d]+),? ?(subpart|regulation|clause|rule|section|part|pt|reg|rr|hcr|ss|s|cl|r)?'
    )
    range_pattern = re.compile('^\w+[-+]\w+$')
    parts = pattern.match(path)
    keys = []
    try:
        if parts:
            parts = parts.groups()
            if any([parts[0].startswith(k) for k in tag_names.keys()]):
                # match up 'cl ' or 's ', 'section ' or 'clause ' then
                # until '('
                label = ''
                remainder = parts[1].strip()
                if remainder:
                    # total hack job, but must support
                    # subpart/part/schedule ranges
                    if range_pattern.match(remainder):
                        return find_sub_node(tree, [remainder],
                                             limit_tags=[],
                                             force_tag=tag_names[parts[0]])
                    sub = part_pattern.match(remainder).groups()
                    # sub[0] is the sch/part label
                    label = sub[0]
                    remainder = remainder[len(label):].replace(',',
                                                               '').strip()
                try:
                    tree = tree.xpath(
                        ".//%s[%s]" % (tag_names[parts[0]],
                                       labelize(label)))[0]
                except IndexError:
                    try:
                        # try fake Part
                        tree = tree.xpath(".//head1[%s]" %
                                          labelize('part %s' % label))[0]
                    except IndexError:
                        # try empty label, ie unnumbered schedule
                        tree = tree.xpath(".//%s" %
                                          (tag_names[parts[0]]))[0]
                remainder = path[len(parts[0]):]
                remainder = re.sub('^ ?1?,?', '', remainder)
                return nodes_from_path_string(tree, remainder)
            else:
                if isinstance(tree, etree._ElementTree) or tree.getroottree(
                ).getroot() == tree:
                    tree = tree.findall(".//body")[0]
                if parts[1]:
                    _keys = map(
                        lambda x: x.strip(),
                        filter(lambda x: len(x),
                               re.split('[^.a-zA-Z\d\+\- ]+', parts[1])))
                    # join on + -
                    i = 0
                    keys = []
                    while i < len(_keys):
                        if i and _keys[i] in ['+', '-']:
                            keys[-1] += _keys[i] + _keys[i + 1]
                            i += 2
                        else:
                            keys.append(_keys[i])
                            i += 1
    except IndexError, e:
        raise CustomException("Path not found")