def get_time():
    """Return the (start, end) time coverage found in the module-level ``doc``.

    Every ``gml:begin/gml:TimeInstant/gml:timePosition`` element in ``doc`` is
    selected; the text of the first match is used as the start and the text of
    the second match as the end.  Both values are passed through
    ``common.time_coverage_fix`` and then ``common.timefix``.

    Returns:
        tuple: ``(start, end)`` as returned by ``common.timefix``, or
        ``("", "")`` when fewer than two positions are present
        (``IndexError``) or a ``ValueError`` is raised while fixing them.
    """
    positions = doc.xpath(
        '//gml:begin/gml:TimeInstant/gml:timePosition',
        namespaces=nspace,
    )
    try:
        fixed = common.time_coverage_fix(positions[0].text, positions[1].text)
        return (common.timefix(fixed[0]), common.timefix(fixed[1]))
    # A tuple is required here: ``except IndexError, ValueError`` only catches
    # IndexError and merely binds the caught exception to the name ValueError.
    except (IndexError, ValueError):
        return ("", "")
def time_coverage_fix(self, time):
    """Normalise the temporal-coverage bounds stored in ``package_dict``.

    The current ``time_period_coverage_start`` / ``time_period_coverage_end``
    values are passed to ``common.time_coverage_fix`` and each result is then
    run through ``common.timefix`` and written back.  If a ``KeyError`` is
    raised along the way, the bounds fall back to the fixed placeholders
    ``1000-01-01`` and ``3000-01-01``.

    NOTE(review): ``package_dict`` is neither a parameter nor a local of this
    method, and the ``time`` argument is never read -- confirm where
    ``package_dict`` is expected to come from.
    """
    start_key = 'time_period_coverage_start'
    end_key = 'time_period_coverage_end'
    try:
        fixed = common.time_coverage_fix(
            package_dict[start_key],
            package_dict[end_key],
        )
        package_dict[start_key] = common.timefix(fixed[0])
        package_dict[end_key] = common.timefix(fixed[1])
    except KeyError:
        # Times were never set
        package_dict[start_key] = "1000-01-01"
        package_dict[end_key] = "3000-01-01"
def time_and_space(self, pilot):
    """Populate the date, frequency, temporal and spatial package fields.

    Reads ``date_released``, ``date_updated``, ``frequency``,
    ``time_period_start``, ``time_period_end`` and ``Geographic_Region_Name``
    from ``pilot`` and writes the converted values into
    ``self.package_dict``.  Conversions are delegated to ``self.rules``
    (``format_date``, ``pilot_frequency_list``, ``geo_region``) and to
    ``common.time_coverage_fix``.

    Args:
        pilot: Mapping holding the pilot record's raw field values.
    """
    #10 if 33
    pkg = self.package_dict
    rules = self.rules

    # Publication metadata.
    pkg['date_published'] = rules.format_date(pilot['date_released'])
    pkg['date_modified'] = rules.format_date(pilot['date_updated'])
    pkg['maintenance_and_update_frequency'] = (
        rules.pilot_frequency_list[pilot['frequency']]
    )
    pkg['portal_release_date'] = '2013-06-18'
    pkg['ready_to_publish'] = False  # Used to be validation_override=True

    # Temporal coverage: fix the raw pair first, then format each bound.
    coverage = common.time_coverage_fix(
        pilot['time_period_start'],
        pilot['time_period_end'],
    )
    pkg['time_period_coverage_start'] = rules.format_date(coverage[0])
    pkg['time_period_coverage_end'] = rules.format_date(coverage[1])

    # Spatial fields: only the named region is filled in; the rest are blank.
    pkg['geographic_region'] = rules.geo_region(pilot['Geographic_Region_Name'])
    pkg['spatial'] = ''
    pkg['spatial_representation_type'] = ''
def process_node(self, count, node, language):
    """Convert one pilot XML record into a CKAN package dictionary.

    Field values are read from ``FORM[NAME=...]/A`` text nodes of ``node``,
    driven by the ``(ckan_name, pilot_name, field)`` triples yielded by
    ``schema_description.dataset_all_fields()``.  After the per-field pass the
    dictionary is post-processed: region names, dates, keywords, titles and
    resource formats are normalised and several fields are set to fixed values.

    Args:
        count: Record counter supplied by the caller; not used by the
            conversion itself.
        node: XML element for one record (must support ``xpath`` and
            ``find``; presumably an lxml element -- TODO confirm).
        language: Passed through unchanged to ``self.node_resources``.

    Returns:
        dict: The populated package dictionary.

    Raises:
        ValueError: If the English or French title lookup yields ``None``.
        KeyError: If a field read during post-processing (for example
            ``geographic_region``, ``date_published``, ``date_modified``,
            ``keywords`` or ``owner_org``) was never populated.
    """
    id_xpath = "FORM[NAME='thisformid']/A/text()"

    # Diagnostic only: report records that carry no form id.  The title
    # lookup is guarded so the diagnostic itself cannot raise IndexError.
    if not node.xpath(id_xpath):
        title_nodes = node.xpath("DC.TITLE")
        print("======NO ID========= %s"
              % (title_nodes[0].text if title_nodes else ''))

    # Bound before the try so a failing lookup can never leave it undefined.
    spatial = ''
    try:
        lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
        upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()")
        if lower_left and upper_right:
            # Each corner is a single "x y" pair; the pieces stay as strings.
            left, bottom = lower_left[0].split(" ")
            right, top = upper_right[0].split(" ")
            spatial = {
                'type': 'Polygon',
                'coordinates': [
                    [left, bottom],
                    [left, top],
                    [right, top],
                    [right, bottom],
                ],
            }
            print(spatial)
    except Exception:
        # Best effort: a malformed corner leaves ``spatial`` at its last
        # assigned value.  ``Exception`` (not a bare except) so that
        # SystemExit / KeyboardInterrupt are not swallowed.
        print("NO GEO")

    package_dict = {
        'resources': self.node_resources(node, language),
        'spatial': spatial,
    }

    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        try:
            if ckan_name == "id":
                package_dict['id'] = str(node.xpath(id_xpath)[0]).lower()
                continue
            if ckan_name in ('name', 'tags'):
                continue
            if ckan_name == 'title':
                title_en = node.xpath("FORM[NAME='title_en']/A/text()")[0]
                if title_en is None:
                    raise ValueError("No English Title")
                package_dict['title'] = self.strip_title(title_en)
                continue
            if ckan_name == 'title_fra':
                title_fr = node.xpath("FORM[NAME='title_fr']/A/text()")[0]
                if title_fr is None:
                    raise ValueError("No French Title")
                # Filter out "-version anglaise" etc.: keep only the text in
                # front of the first language marker found.  The full title is
                # the default, so a match is no longer overwritten afterwards.
                cleaned_fr = title_fr
                for marker in common.language_markers_fra:
                    if marker in title_fr:
                        cleaned_fr = title_fr.split(marker)[0]
                        break
                package_dict['title_fra'] = cleaned_fr
                continue

            # NOTE(review): the nesting below was reconstructed; fields
            # without a pilot name are assumed to be left untouched here --
            # confirm against the intended behaviour.
            if not pilot_name:
                continue

            if pilot_name == "url_fra":
                print(pilot_name)

            result = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
            value = result[0] if result else ''

            if "|" in value:
                # Choice values look like "<label>|<uuid>"; map the uuid to
                # the CKAN choice key.
                choice = field['choices_by_pilot_uuid'][value.split("|")[1]]
                package_dict[ckan_name] = choice['key']
                if pilot_name == "department":
                    package_dict['owner_org'] = choice['key']
            elif pilot_name == 'frequency':
                package_dict['maintenance_and_update_frequency'] = (
                    pilot_frequency_list[value or '']
                )
            else:
                package_dict[ckan_name] = value
        except IndexError:
            # The record has no such form entry: store an empty value.
            package_dict[ckan_name] = ''
        except KeyError as e:
            print("KEY ERROR :  %s %s %s" % (ckan_name, pilot_name, e))
            package_dict[ckan_name] = ''

    # Filter out things that will not pass validation.
    region = package_dict['geographic_region']
    if region == "Canada Canada":
        region = ''
    package_dict['geographic_region'] = region.replace(
        "Yukon Territory Territoire du Yukon", "Yukon Yukon")

    package_dict['author_email'] = '*****@*****.**'
    package_dict['catalog_type'] = (
        schema_description.dataset_field_by_id['catalog_type']['choices'][0]['key']
    )

    # Override validation.
    package_dict['validation_override'] = validation_override

    # Fix dates.
    try:
        fixed = common.time_coverage_fix(
            package_dict['time_period_coverage_start'],
            package_dict['time_period_coverage_end'],
        )
        package_dict['time_period_coverage_start'] = common.timefix(fixed[0])
        package_dict['time_period_coverage_end'] = common.timefix(fixed[1])
    except KeyError:
        # Times were never set.
        package_dict['time_period_coverage_start'] = "1000-01-01"
        package_dict['time_period_coverage_end'] = "3000-01-01"

    package_dict['date_published'] = package_dict['date_published'].replace("/", "-")
    for date_key in ('time_period_coverage_start',
                     'time_period_coverage_end',
                     'date_published'):
        package_dict[date_key] = check_date(package_dict[date_key])

    package_dict['portal_release_date'] = '2013-05-24'
    if node.find("FLOWSTATUS").text == "pending":
        package_dict['portal_release_date'] = ''
    package_dict['ready_to_publish'] = True
    package_dict['license_id'] = 'ca-ogl-lgo'

    def reformat_date(date_string):
        """Return ``date_string`` (m/d/Y or Y/m/d) as an ISO 8601 date."""
        stripped = date_string.strip()
        try:
            timepoint = datetime.strptime(stripped, "%m/%d/%Y")
        except ValueError:
            timepoint = datetime.strptime(stripped, "%Y/%m/%d")
        return timepoint.date().isoformat()

    if "/" in package_dict['date_modified']:
        package_dict['date_modified'] = reformat_date(package_dict['date_modified'])

    # Strip characters from the keyword lists, then drop entries that are
    # too short or too long.
    key_eng = (package_dict['keywords']
               .replace("\n", " ").replace("/", "-")
               .replace("(", "").replace(")", "")
               .replace(":", "-").replace(u"\u00b4", "'")
               .split(","))
    key_fra = (package_dict['keywords_fra']
               .replace("\n", " ").replace("/", "-")
               .replace('"', '').replace("(", "")
               .replace(":", "-").replace(")", "")
               .split(","))
    package_dict['keywords'] = ",".join(
        [k.strip() for k in key_eng if 1 < len(k) < 100])
    # NOTE(review): unlike the English list, French keywords are not
    # stripped -- confirm whether that difference is intended.
    package_dict['keywords_fra'] = ",".join(
        [k for k in key_fra if 1 < len(k) < 100])

    if package_dict['owner_org'] == 'aafc-aac':
        # Keep only the text that follows the first matching title marker.
        for title_key in ('title', 'title_fra'):
            for marker in agriculture_title_markers:
                if marker in package_dict[title_key]:
                    remainder = package_dict[title_key].split(marker)[1]
                    package_dict[title_key] = remainder.lstrip(" ")
                    break

    if package_dict['owner_org'] == 'hc-sc':
        for resource in package_dict['resources']:
            if resource['resource_type'] == 'file':
                resource['format'] = 'TXT'

    return package_dict