def get_data(src, act_ids, url): """ FUNCTION get data of an act from a source in parameter PARAMETERS src: source (eurlex or oeil) [string] act_ids: dictionary of act ids for each source [dictionary of ActIds instances] url: link to the act page [string] RETURN fields: dictionary which contains retrieved data for a given source [dictionary] dg_names: list of dg names [list of strings] resp_names: list of resp names [list of strings] """ logger.debug("get_data") fields = {} dg_names = [None] * nb_dgs resp_names = [None] * nb_resps ok = False logger.debug("get_url_content_" + src) if src == "eurlex": url_content = [get_url_content_eurlex(url[0]), get_url_content_eurlex(url[1])] if url_content[0] is not False: ok = True elif src == "oeil": # oeil url_content = get_url_content_oeil(url) if url_content is not False: ok = True # if the url exists and there is a valid content if ok: setattr(act_ids[src], "url_exists", True) fields, dg_names, resp_names = eval("get_data_" + src)(url_content, act_ids["index"]) else: setattr(act_ids[src], "url_exists", False) logger.debug("error while retrieving " + src + " url") print "error while retrieving " + src + " url" # update url exist attribute logger.debug("act_ids to be saved") act_ids[src].save() return fields, dg_names, resp_names
def handle(self, **options): #get type_act for acts of 2014 NOT YET VALIDATED AND VALIDATE THEM for the statistical analysis for act in Act.objects.filter(type_acte__isnull=True, validated=2, releve_annee=2014): print act #url content no_celex=ActIds.objects.get(src="index", act=act).no_celex url=get_url_eurlex(no_celex) soup=get_url_content_eurlex(url) #type acte act.type_acte=get_type_acte(soup) act.save()
def handle(self, **options): for act_ids in ActIds.objects.filter(src="index", act__validated=2, act__releve_annee=2013): act=act_ids.act print act print "act.adopt_propos_origine", act.adopt_propos_origine url=get_url_eurlex(act_ids.no_celex, tab="HIS") soup_his=get_url_content_eurlex(url) soup_his=soup_his.find("div", {"class": "tabContent"}) #remove script tags [s.extract() for s in soup_his('script')] adopt_propos_origine=get_adopt_propos_origine(soup_his, act_ids.propos_origine) print "adopt_propos_origine", adopt_propos_origine if str(adopt_propos_origine)!=str(act.adopt_propos_origine): print "DIFFERENT" break
def handle(self, **options): for act_ids in ActIds.objects.filter(src="index", act__validated=2, act__date_cons_a__isnull=True, act__date_cons_b__isnull=True, act__releve_annee__in=[1996,2013,2014]): act=act_ids.act print act url=get_url_eurlex(act_ids.no_celex, tab="HIS") soup_his=get_url_content_eurlex(url) soup_his=soup_his.find("div", {"class": "tabContent"}) #remove script tags [s.extract() for s in soup_his('script')] point_b_tables=get_point_b_tables(soup_his, act_ids.propos_origine) act.date_cons_b=get_date_cons_b(point_b_tables) print "date_cons_b", act.date_cons_b point_a_tables=get_point_a_tables(soup_his, act_ids.propos_origine) act.date_cons_a=get_date_cons_a(point_a_tables) print "date_cons_a", act.date_cons_a act.save()