def expand_doc_node(self, doc_id):
    """Add a document's children, cited acts/regulations, and incoming citations to the graph."""
    doc = jsearch.get_record(doc_id)

    for k in doc['children']:
        self.graph['nodes'].add(doc_name(k['id']))
        self.graph['edges'].add(
            (doc_name(k['id']), doc_name(doc_id), 'is_part_of'))

    cites = doc['cited_associations']
    for a in cites['act_ids']:
        self.graph['nodes'].add(node_name(a, 'acts'))
        self.graph['edges'].add(
            (doc_name(doc_id), node_name(a, 'acts'), 'references'))
    for r in cites['named_regulation_ids']:
        self.graph['nodes'].add(node_name(r, 'named_regulations'))
        self.graph['edges'].add(
            (doc_name(doc_id), node_name(r, 'named_regulations'), 'references'))

    incoming = doc['incoming_citation_ids']
    if not incoming:
        return None
    # cap incoming citations at 25 to keep the neighborhood manageable
    for d in incoming[0:25]:
        self.graph['nodes'].add(doc_name(d))
        self.graph['edges'].add(
            (doc_name(d), doc_name(doc_id), 'is_cited_by'))
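# n.b. doc_name and node_name are used above but not defined in this snippet.
# A minimal sketch of what they plausibly look like, inferred only from their
# call sites (doc_name takes an id, node_name an id plus a type) -- hypothetical,
# not the actual implementations:
def doc_name(doc_id):
    # prefix document ids so they cannot collide with other entity types
    return 'documents_{}'.format(doc_id)

def node_name(entity_id, entity_type):
    # e.g. node_name(7, 'acts') -> 'acts_7'
    return '{}_{}'.format(entity_type, entity_id)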
def get_document_by_id(doc_id, params):
    try:
        decorate_children = params.get('decorate_children', False)
        skip_unused_fields = params.get('skip_unused_fields', False)
        skip_fields_for_state_code = params.get('skip_fields_for_state_code', False)
        skip_fields_for_right_panel = params.get('skip_fields_for_right_panel', False)

        # skip the unused fields if the request told us to do so
        es_params = {}
        if skip_unused_fields:
            if skip_fields_for_state_code:
                fields_to_skip = UNUSED_FIELDS_FOR_STATE_CODE
            elif skip_fields_for_right_panel:
                fields_to_skip = UNUSED_FIELDS_FOR_RIGHT_PANEL
            else:
                fields_to_skip = UNUSED_FIELDS_BY_ALL

            # n.b. yes this needs to be a comma separated string for some reason
            es_params["_source_exclude"] = ",".join(fields_to_skip)

        doc = jsearch.get_record(doc_id, params=es_params)
        docs = [doc]  # hack to re-use same methods as get all documents code
        decorated_docs = decorate_documents(docs, g.user_id, decorate_children=decorate_children)

        # TODO: Allow turning this off to return all topics with their probabilities by an optional flag
        topic_filtered_docs = apply_threshold_topics_in_document_response(decorated_docs)

        return jsonify({'document': topic_filtered_docs[0]})
    except elasticsearch.NotFoundError:
        return jsonify({"errors": "Not found"}), 404
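# A sketch of how get_document_by_id might be wired up, assuming the Flask app
# implied by the jsonify/g usage above. The route path, app object, and use of
# request.args as the params MultiDict are all hypothetical:
@app.route('/documents/<int:doc_id>')
def document_route(doc_id):
    return get_document_by_id(doc_id, request.args)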
def set_seed_docids(self):
    data = jsearch.get_record(self.act_id, 'acts')
    if not data['doc_ids']:
        print("Seed Neighborhood not found for {} ...".format(self.root_key))
        return False
    self.seed_docids = data['doc_ids']
    return True
def pop_topic_judgment():
    while True:
        topic_judgment = get_topic_judgment_for_user()
        try:
            if topic_judgment:
                doc = jsearch.get_record(topic_judgment.doc_id)
            break
        except NotFoundError:
            # the judgment points at a document that no longer exists in the index;
            # flag it and try the next one
            topic_judgment.status = 'bad_doc'
            db_session_users.add(topic_judgment)
            db_session_users.commit()

    if topic_judgment:
        topic_judgment.status = 'assigned'
        topic_judgment.user_id = g.user_id
        db_session_users.add(topic_judgment)
        db_session_users.commit()
    else:
        return {'queue': 'empty'}

    user = db_session_users.query(User).filter_by(id=g.user_id).first()
    return {
        'id': topic_judgment.id,
        'status': topic_judgment.status,
        'judgment': topic_judgment.judgment,
        'document': doc,
        'user': user.to_dict(),
        'topic_name': topic_judgment.topic_name
    }
def document_timelines(doc_id, params, user_id):
    # n.b. only need the dockets themselves for this query
    es_params = {"_source_include": "dockets"}
    doc = jsearch.get_record(doc_id, params=es_params)
    docket_ids = [d["docket_id"] for d in doc["dockets"]]
    timelines = {}
    for dok_id in docket_ids:
        timelines[dok_id] = docket_timeline(dok_id, params, user_id)
    return timelines
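# Hypothetical usage: build a timeline for every docket attached to a document.
# The doc id, empty params dict, and user id are all illustrative, and the shape
# of each entry comes from docket_timeline, which is defined elsewhere:
timelines_by_docket = document_timelines(1705526, {}, user_id=42)
# => a dict keyed by docket_id, one docket_timeline payload per docket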
def get_annotation_job_by_id(annotation_task_id, annotation_job_id, user_id):
    # n.b. user_id is redundant but this should prevent shenanigans here
    annotation_job = db_session_users.query(AnnotationJob).\
        filter_by(id=annotation_job_id, user_id=user_id).first()
    annotation_job_dict = annotation_job.to_dict()

    # n.b. i deliberately left the user_id restriction here in case future QA tasks
    # might allow super annotators to edit user annotations
    topic_annotations = db_session_users.query(TopicAnnotation).filter_by(annotation_job_id=annotation_job_id)
    annotation_job_dict['topic_annotations'] = [t.to_dict() for t in topic_annotations]

    doc_dict = jsearch.get_record(annotation_job.doc_id)
    return {'annotation_job': annotation_job_dict, 'document': doc_dict}
def find_or_return_new_search_query(search_args, save_and_refresh_if_new=False):
    search_args = sanitize_search_args(search_args)
    search_args_hash = hashlib.sha1(json.dumps(search_args)).hexdigest()
    search_entry = db_session_users.query(SearchQuery).filter_by(
        search_args_hash=search_args_hash).first()

    if search_entry is None:
        # for proposed filters, this is false, as we've curated it, but if there is no filter,
        # and this is a query search, then we mark it as true
        is_arbitrary_query = False

        # For the very first time only, we need to figure out the display name - and avoid
        # extra API queries per update or per fetch of the top 5 options
        display_name = None
        entity_lookups = [
            ("agency_id", "agencies"),
            ("act_id", "acts"),
            ("regulation_id", "named_regulations"),
            ("citation_id", "citations"),
            ("concept_id", "concepts"),
            ("bank_id", "banks"),
            ("topic_id", "topics"),
        ]
        for arg_key, doc_type in entity_lookups:
            if arg_key in search_args:
                entity = jsearch.get_record(search_args[arg_key], doc_type)
                display_name = entity["name"]
                break
        else:
            if "query" in search_args:
                is_arbitrary_query = True

        # TODO should we have a rollback case for possible race conditions on the create call
        search_entry = SearchQuery(search_args=search_args, display_name=display_name,
                                   is_arbitrary_query=is_arbitrary_query)

        # in the new case, when this flag is set, save and refresh the value
        if save_and_refresh_if_new:
            db_session_users.add(search_entry)
            db_session_users.commit()
            db_session_users.refresh(search_entry)

    return search_entry
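# Hypothetical usage: fetch (or lazily create and persist) the SearchQuery row
# for an agency-scoped search. The agency id is illustrative:
entry = find_or_return_new_search_query({'agency_id': 17},
                                        save_and_refresh_if_new=True)
print(entry.display_name)  # resolved once from the 'agencies' index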
def test_cover_page(self):
    es_doc = jsearch.get_record(1)  # XXX real id
    doc = Document(es_doc)
    # n.b. table_contents and summary are assumed to be defined elsewhere in this test module
    with open('/tmp/cover.pdf', 'w') as cover_file:
        create_cover_page(
            title="Check out this cool document",
            table_contents=table_contents,
            text_para=summary,
            file_obj=cover_file
        )
def get_agency_info_by_id(agency_id):
    if agency_id is None:
        return jsonify({"errors": "No agency_id param"}), 400

    es_params = {"_source_include": ",".join(INCLUDED_AGENCY_FIELDS)}
    try:
        ret_agency = jsearch.get_record(agency_id, doc_type='agencies', params=es_params)
    except elasticsearch.NotFoundError:
        return jsonify({"errors": "Not found"}), 404

    if ret_agency:
        return ret_agency, 200
    else:
        return jsonify({"errors": "Not found"}), 404
def get_entity_by_type_and_id(entity_type, entity_id):
    if entity_type not in ALLOWED_ENTITY_TYPES:
        return {"errors": "invalid type"}
    return jsearch.get_record(entity_id, entity_type)
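# n.b. ALLOWED_ENTITY_TYPES is referenced above but not defined in this snippet.
# Judging from the doc types queried across these modules, it plausibly looks
# something like the following (hypothetical, not the canonical list):
ALLOWED_ENTITY_TYPES = (
    'agencies', 'acts', 'named_regulations', 'citations',
    'concepts', 'banks', 'topics', 'news_sources', 'jurisdictions',
)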
def get_entity_from_es(user_followed_entity):
    return {
        'entity': jsearch.get_record(user_followed_entity.entity_id,
                                     user_followed_entity.entity_type)
    }
def pop_annotation_job_from_queue(annotation_task_id, user_id):
    time_now = datetime.datetime.now()

    # grabs queued annotation jobs for this task that are assigned to the user (or nobody),
    # ordered first by whether they have a user assignment, next by highest priority,
    # and finally falling back on the oldest created
    annotation_job = db_session_users.query(AnnotationJob).filter_by(annotation_task_id=annotation_task_id)\
        .filter_by(status=AnnotationJob.QUEUED_STATUS)\
        .filter(or_(AnnotationJob.user_id == user_id, AnnotationJob.user_id == None))\
        .order_by(AnnotationJob.user_id.nullslast(), AnnotationJob.priority.desc(),
                  AnnotationJob.created_at.asc()).first()

    # if by chance, we are in the period of time between when a task was updated, but before
    # the next queuing run came around, we want to make sure to look up annotation jobs for
    # older annotation tasks too
    if annotation_job is None:
        old_annotation_task_ids = db_session_users.query(AnnotationTask.id)\
            .filter_by(active_task_id=annotation_task_id).subquery()
        annotation_job = db_session_users.query(AnnotationJob)\
            .filter(AnnotationJob.annotation_task_id.in_(old_annotation_task_ids)) \
            .filter_by(status=AnnotationJob.QUEUED_STATUS) \
            .filter(or_(AnnotationJob.user_id == user_id, AnnotationJob.user_id == None)) \
            .order_by(AnnotationJob.user_id.nullslast(), AnnotationJob.priority.desc(),
                      AnnotationJob.created_at.asc()).first()

    if annotation_job is None:
        return {"annotation_job": None}

    annotation_job.status = AnnotationJob.ASSIGNED_STATUS
    annotation_job.user_id = user_id
    annotation_job.assigned_at = time_now
    db_session_users.add(annotation_job)
    db_session_users.commit()
    db_session_users.refresh(annotation_job)
    # n.b. mitigation strategy for race condition would look like:
    # while the assigned user_id is not me -> query again

    # change status to error status if document is not found in index
    try:
        doc_dict = jsearch.get_record(annotation_job.doc_id)
    except NotFoundError:
        annotation_job.status = AnnotationJob.ERROR_STATUS
        annotation_job.notes = "Document is not found"
        db_session_users.add(annotation_job)
        db_session_users.commit()
        db_session_users.refresh(annotation_job)
        return {"errors": "Document is not found. Doc ID: " + str(annotation_job.doc_id)}

    # if this is a training job, return info about the correct judgment
    if annotation_job.is_gold_evaluation:
        # get gold judgment info to return with the annotation_job object
        topic_group_id_subquery = db_session_users.query(AnnotationTask.annotation_task_topic_group_id)\
            .filter_by(id=annotation_job.annotation_task_id)\
            .subquery()  # should contain just one result

        gold_judgment_id_subquery = db_session_users.query(AggregatedAnnotations.gold_topic_annotation_id)\
            .filter_by(doc_id=annotation_job.doc_id)\
            .filter(AggregatedAnnotations.annotation_task_group_id.in_(topic_group_id_subquery))\
            .subquery()

        gold_judgment_object = db_session_users.query(TopicAnnotation.is_positive, TopicAnnotation.admin_notes)\
            .filter(TopicAnnotation.id.in_(gold_judgment_id_subquery))\
            .first()  # this query should return just one object anyway

        return {'annotation_job': annotation_job.to_dict(),
                'document': doc_dict,
                'correct_judgment': gold_judgment_object.is_positive,
                'correct_judgment_notes': gold_judgment_object.admin_notes}

    return {'annotation_job': annotation_job.to_dict(), 'document': doc_dict}
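# Hypothetical usage of the two job endpoints together: pop the next queued job
# for a task, then re-fetch it with its annotations. The task and user ids are
# illustrative, and this assumes to_dict() exposes the job's id:
popped = pop_annotation_job_from_queue(annotation_task_id=3, user_id=42)
if popped.get('annotation_job'):
    job_id = popped['annotation_job']['id']
    details = get_annotation_job_by_id(3, job_id, user_id=42)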
def activate_user(params):
    email = params.get('email')
    token = params.get('token')
    new_password = params.get('new_password')
    first_name = params.get('first_name')
    last_name = params.get('last_name')
    is_contributor = params.get('is_contributor')
    dry_run = params.get('dry_run', False)  # validate token, email, enabled state only
    linkedin_id = params.get('linkedin_id')
    google_id = params.get('google_id')
    enabled = params.get('enabled')

    # params to go into json field in db
    json_params = [
        'agencies', 'state_agencies', 'other_agencies',
        'other_state_agencies', 'other_topics', 'user_style'
    ]

    def error_response(msg='Invalid email or token', code=400):
        response = jsonify({'error': msg})
        response.status_code = code
        return response

    # confirmation_required tracks whether this is an activation sourced from a marketing
    # campaign, a signup without a token, or from the invite -> activate flow.
    # use confirmation_required to indicate we need to send a confirmation email later on
    confirmation_required = False
    marketing_campaign = db_session_users.query(MarketingCampaign).filter_by(token=token).first()
    if marketing_campaign is not None or token is None:
        confirmation_required = True
    else:
        if email is None:
            return error_response()
        else:
            email = email.lower()
        g.user_email = email
        user = db_session_users.query(User).filter_by(email=email).scalar()
        if user is None:
            return error_response()
        if dry_run and user.enabled:
            return error_response('User is already enabled')
        enabled_at_start = user.enabled
        if not user.reset_token or user.reset_token != token:
            # send an email to support, but only if the user is in the db to prevent spamming
            if dry_run:
                template_vars = {'email': email}
                email_helper.send_email(
                    '*****@*****.**',
                    '*****@*****.**',
                    'A user attempted to use an invalid token during activation',
                    template='activate-fail',
                    vars=template_vars,
                )
            return error_response()

    if dry_run:
        return jsonify({'marketing_campaign': marketing_campaign is not None})

    if not new_password:
        return error_response('Missing fields')

    # for the marketing campaign approach, create an entry in the users table,
    # for the invite-based registration approach, mark the user enabled
    if confirmation_required:
        email = email.lower()
        g.user_email = email

        # check if this user exists in the database (the invite use-case), so we can use the
        # existing entry if so and create a new entry if not
        user = db_session_users.query(User).filter_by(email=email).first()

        # this is for when a user comes to our site without being invited through the admin tool
        if user is None:
            user = User({
                'email': email,
                'first_name': first_name,
                'last_name': last_name,
                'password': new_password,
                'enabled': False,
            })
        # this is for when the user is instead invited to our site, but then instead of trying
        # to enter via the invitation link, they use the regular user signup flow. they will
        # now get the confirmation email and have to fully activate their account there
        else:
            # triple check to prevent any shenanigans for enabled users, or user accounts
            # that somehow exist but were not invited, and also if the invite has already been
            # skipped and we have successfully moved onto the confirmation step
            # n.b. relying on hash values is a little funky here, but it seems to work
            if user.enabled or "invited_by" not in user.properties or "invite_skipped" in user.properties:
                return error_response()
            user.properties["invite_skipped"] = True  # n.b. record that the invite workflow was skipped
            user.first_name = first_name
            user.last_name = last_name
            user.update_password(new_password)

        if linkedin_id:
            user.linkedin = linkedin_id
            user.industry = params.get('industry')
            user.company = params.get('company')
            user.properties['linkedin_data'] = params.get('linkedin_data')
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        if google_id:
            user.google_id = google_id
            user.enabled = enabled
            user.properties['confirmed_date'] = datetime.datetime.utcnow().isoformat()

        # mark internal users with the internal user flag so we can differentiate user types
        # when calculating various stats
        if email.endswith("@jurispect.com") or email.endswith("@compliance.ai"):
            user.is_internal_user = True

        if marketing_campaign is not None:
            user.marketing_campaigns.append(marketing_campaign)

        user.gen_reset_token()
        enabled_at_start = False

        try:
            _send_activation_email('confirm', user)
        except SMTPException:
            db_session_users.rollback()
            return error_response('Could not send email', code=500)
    else:
        user.enabled = True
        user.update_password(new_password)
        if first_name:
            user.first_name = first_name
        if last_name:
            user.last_name = last_name
        # only allow the token to be used once:
        user.reset_token = None

    new_props = {p: params[p] for p in json_params if params.get(p)}

    # n.b. since this route is shared with password resets, we need to skip updating the
    # activation time when it is a password reset action
    if not enabled_at_start:
        new_props['activation_time'] = datetime.datetime.utcnow().isoformat()

    if not params.get('user_style') and email.endswith('@firstrepublic.com'):
        new_props['user_style'] = 'first-republic'

    if len(new_props) > 0:
        user.properties = merge_two_dicts(user.properties, new_props)

    if is_contributor:
        user.roles = ['contributor']

    # FIXME: this is needed for marketing-campaign sourced users but yields a double commit
    # probably not super noticeable, but should fix if we have the time
    db_session_users.add(user)
    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()
    db_session_users.refresh(user)

    topic_ids = []
    topic_ids.extend(params.get('topics', AggregatedAnnotations.topic_id_name_mapping.keys()))
    for topic_id in topic_ids:
        user_topic = UserTopic({
            'user_id': user.id,
            'topic_id': topic_id,
            'following': True
        })
        db_session_users.add(user_topic)

    news_ids = [x['id'] for x in jsearch.query_records({'size': 1000}, doc_type='news_sources')]
    for news_id in news_ids:
        user_followed_entity = UserFollowedEntity({
            'user_id': user.id,
            'entity_id': news_id,
            'entity_type': 'news_sources',
            'following': True
        })
        db_session_users.add(user_followed_entity)

    agency_ids = []
    agency_ids.extend(params.get('agencies', []))
    new_ids = []
    # verify that the agency ids are correct
    # using DefaultAgenciesToFollowAtSignup since users now skip onboarding
    for agency_id in DefaultAgenciesToFollowAtSignup:
        try:
            agency = jsearch.get_record(agency_id, 'agencies')
            new_ids.append(agency['id'])
        except NotFoundError:
            pass
    for agency_id in new_ids:
        user_agency = UserAgency({'user_id': user.id, 'agency_id': agency_id, 'following': True})
        db_session_users.add(user_agency)

    state_jurisdictions = []
    state_jurisdictions.extend(params.get('state_agencies', []))
    state_ids = []
    # get selected state jurisdiction ids and add them to follow entity table
    for state_jurisdiction in state_jurisdictions:
        try:
            state = get_state_by_short_name('jurisdictions', state_jurisdiction)
            state_ids.append(state['id'])
        except NotFoundError:
            pass
    updated_followed_entity(user.id, {'entities': [{
        'entity_id': state_id,
        'entity_type': 'jurisdictions',
        'following': True
    } for state_id in state_ids]})

    # send a support mail if the user requests a new source
    other_agencies = params.get('other_agencies', '')
    other_state_agencies = params.get('other_state_agencies', '')
    other_topics = params.get('other_topics', '')
    if other_agencies or other_state_agencies or other_topics:
        template_vars = {
            'other_agencies': other_agencies,
            'other_state_agencies': other_state_agencies,
            'other_topics': other_topics,
            'name': '%s %s' % (first_name, last_name),
            'email': email,
        }
        email_helper.send_email(
            '*****@*****.**',
            '*****@*****.**',
            'A new user has requested additional sources or topics',
            template='additional-sources',
            vars=template_vars,
        )

    try:
        db_session_users.commit()
    except IntegrityError:
        return error_response()

    # start free trials.
    user = db_session_users.query(User).filter_by(email=email.lower()).first()
    latest_subscription = db_session_users.query(Subscription).filter_by(user_id=user.id, latest=True).first()
    if latest_subscription is None:
        # new users with a .edu email get a 120 month free trial.
        if user.email.endswith('.edu'):
            subscribe_users_to_plan([user.id], 'free_trial_120months')
        # all other users get a 1 month free trial
        else:
            start_free_trial(user.id)

    create_folder(user.id, {'name': 'Read'})
    create_folder(user.id, {'name': 'Bookmarked'})

    if confirmation_required:
        # special case login for unenabled marketing campaign users: allow access for 7 days only
        expiration_datetime = datetime.datetime.utcnow() + datetime.timedelta(days=7)
        token = jwt.encode({'user_id': user.id, 'exp': expiration_datetime}, SECRET_JWT)
        # the 'is_new' flag defines whether the user is returning or just registered;
        # True means just registered
        return jsonify({"jwt_token": token, "is_new": True, 'email': email.lower()})
    else:
        # return empty if user not from marketing campaign
        return jsonify({})
if not user_interval_preference[opts.job_interval] and not opts.force:
    print(user.email + " is not configured for " + opts.job_interval)
    continue

followed_agency_ids = get_followed_agency_ids_with_backoff(user.id)
agency_overview = {}
for followed_agency_id in followed_agency_ids:
    # n.b. need to figure out a better way to filter out state code
    if followed_agency_id > 999:
        continue
    # n.b. we need to rescue elasticsearch not found errors here because we don't guarantee
    # consistency between the followed agency ids stored in the user db and any updates in
    try:
        agency_name = jsearch.get_record(str(followed_agency_id), 'agencies')['short_name']
    except elasticsearch.exceptions.NotFoundError:
        continue
    agency_overview[agency_name] = {
        "types": {},
        "agency_id": followed_agency_id,
        "published_from": overview_from_date,
        "published_to": today_str
    }

for doc_type in distinct_document_types:
    if doc_type == 'Enforcement Metadata' or doc_type == 'Mainstream News':
        continue
    params = MultiDict({
import os
import sys

from lxml import etree
from dateutil import parser
from werkzeug.datastructures import MultiDict

this_folder = os.path.dirname(os.path.realpath(__file__))
sys.path.append(this_folder + '/..')

import schemas.jurasticsearch as jsearch
from helpers.document_helper import get_filtered_documents

distinct_document_types = jsearch.get_distinct_attribute_values('category')

document_id = 1705526
comments_date = "01/17/2017"
effective_date = None

doc = jsearch.get_record(document_id)
# json.dump(doc, open('socgen.json', 'wb'))

document = etree.Element("document")


def create_basic_doc(document, doc_dict):
    basic_fields = ['title', 'category', 'pdf_url', 'web_url']
    for field_name in basic_fields:
        field_entry = etree.SubElement(document, field_name)
        field_entry.text = doc_dict[field_name]
    publication_date = etree.SubElement(document, "publication_date")
    publication_date.text = parser.parse(doc_dict['publication_date']).strftime("%m/%d/%Y")
    sources = etree.SubElement(document, "sources")
    for a in doc_dict['agencies']:
        source = etree.SubElement(sources, "source")