Ejemplo n.º 1
0
def get_time():
    """Return the (start, end) time coverage read from the metadata document.

    Selects every ``gml:timePosition`` under ``gml:begin/gml:TimeInstant`` in
    the module-level ``doc`` (resolved with the ``nspace`` namespace map),
    passes the text of the first two matches to ``common.time_coverage_fix``
    and runs each element of the result through ``common.timefix``.

    Returns:
        tuple: ``(start, end)`` as produced by ``common.timefix``, or
        ``("", "")`` when fewer than two positions were found or a
        ``ValueError`` was raised while normalising them.
    """
    # NOTE(review): both values come from matches of the gml:begin path; the
    # second match is treated as the end of the period -- confirm this is
    # what the source documents actually contain.
    positions = doc.xpath('//gml:begin/gml:TimeInstant/gml:timePosition',
                          namespaces=nspace)

    try:
        t = common.time_coverage_fix(positions[0].text, positions[1].text)
        return (common.timefix(t[0]), common.timefix(t[1]))
    except (IndexError, ValueError):
        # A tuple is required here: the form `except IndexError, ValueError:`
        # catches only IndexError and rebinds the name ValueError to it.
        return ("", "")
Ejemplo n.º 2
0
    def time_coverage_fix(self, time):
        """Normalise the time-coverage bounds stored in ``package_dict``.

        The start and end values are passed together through
        ``common.time_coverage_fix`` and then individually through
        ``common.timefix``. If either key is missing, the bounds are set to
        the placeholder range 1000-01-01 .. 3000-01-01.

        NOTE(review): ``package_dict`` is neither a parameter nor a local, so
        it must be resolvable from an enclosing or module scope, and the
        ``time`` argument is not used -- confirm against the full file.
        """
        start_key = 'time_period_coverage_start'
        end_key = 'time_period_coverage_end'

        try:
            bounds = common.time_coverage_fix(package_dict[start_key],
                                              package_dict[end_key])
            package_dict[start_key] = common.timefix(bounds[0])
            package_dict[end_key] = common.timefix(bounds[1])
        except KeyError:
            # Times were never set: fall back to the placeholder range.
            for key, fallback in ((start_key, "1000-01-01"),
                                  (end_key, "3000-01-01")):
                package_dict[key] = fallback
Ejemplo n.º 3
0
    def process_node(self, count, node, language):
        """Convert one pilot record node into a CKAN-style package dictionary.

        Field values are read from ``FORM[NAME=...]/A/text()`` children of
        ``node`` and stored under the names yielded by
        ``schema_description.dataset_all_fields()``. The result is then
        post-processed: region names, author e-mail, catalogue type, dates,
        release date, licence, keywords and department-specific title and
        resource fix-ups.

        Args:
            count: Running record number supplied by the caller; it does not
                influence the conversion.
            node: Record element exposing ``xpath()`` and ``find()``
                (presumably an lxml element -- confirm against the caller).
            language: Passed through unchanged to ``self.node_resources``.

        Returns:
            dict: The populated package dictionary.

        Raises:
            KeyError: If a field read during post-processing (for example
                ``geographic_region``, ``date_published``, ``keywords`` or
                ``owner_org``) was not produced by the schema loop.
            ValueError: If ``date_modified`` contains "/" but matches neither
                ``%m/%d/%Y`` nor ``%Y/%m/%d``.
        """
        form_id_xpath = "FORM[NAME='thisformid']/A/text()"

        # Report records without a form id; the id itself is stored by the
        # schema loop below.
        if not node.xpath(form_id_xpath):
            titles = node.xpath("DC.TITLE")
            print("======NO ID========= %s" % (titles[0].text if titles else ''))

        # Initialised before the lookup so it is always bound when stored.
        spatial = ''
        try:
            geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
            geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()")
            if geo_lower_left and geo_upper_right:
                left, bottom = geo_lower_left[0].split(" ")
                right, top = geo_upper_right[0].split(" ")
                # NOTE(review): coordinates stay as strings and the ring is a
                # flat list of four corners -- confirm the consumer expects
                # this shape.
                coordinates = [[left, bottom], [left, top],
                               [right, top], [right, bottom]]
                spatial = {'type': 'Polygon', 'coordinates': coordinates}
                print(spatial)
        except ValueError:
            # A corner did not split into exactly two space-separated parts.
            print("NO GEO")

        package_dict = {
            'resources': self.node_resources(node, language),
            'spatial': spatial,
        }

        for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
            try:
                if ckan_name == 'id':
                    package_dict['id'] = str(node.xpath(form_id_xpath)[0]).lower()
                    continue

                if ckan_name in ('name', 'tags'):
                    # Skipped: this loop does not fill these two fields.
                    continue

                if ckan_name == 'title':
                    # A missing title raises IndexError and is stored as ''
                    # by the handler below.
                    title_en = node.xpath("FORM[NAME='title_en']/A/text()")[0]
                    package_dict['title'] = self.strip_title(title_en)
                    continue

                if ckan_name == 'title_fra':
                    title_fr = node.xpath("FORM[NAME='title_fr']/A/text()")[0]
                    # Default first, so the field is set even when the marker
                    # list is empty; then cut at the first marker found
                    # (filters out "-version anglaise" etc.).
                    package_dict['title_fra'] = title_fr
                    for marker in common.language_markers_fra:
                        if marker in title_fr:
                            package_dict['title_fra'] = title_fr.split(marker)[0]
                            break
                    continue

                value = ''
                if pilot_name:
                    if pilot_name == "url_fra":
                        # Debug trace kept so console output is unchanged.
                        print(pilot_name)
                    result = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
                    if result:
                        value = result[0]

                if "|" in value:
                    # The part after the first "|" is looked up in the
                    # field's choices_by_pilot_uuid table.
                    choice_uuid = value.split("|")[1]
                    choice_key = field['choices_by_pilot_uuid'][choice_uuid]['key']
                    package_dict[ckan_name] = choice_key
                    if pilot_name == "department":
                        package_dict['owner_org'] = choice_key
                elif pilot_name == 'frequency':
                    # value is '' when the record has no frequency, which
                    # selects the table's '' entry.
                    package_dict['maintenance_and_update_frequency'] = pilot_frequency_list[value]
                else:
                    package_dict[ckan_name] = value

            except IndexError:
                # The expected FORM element is absent from this record.
                package_dict[ckan_name] = ''
            except KeyError as e:
                print("KEY ERROR :  %s %s %s" % (ckan_name, pilot_name, e))
                package_dict[ckan_name] = ''

        # Filter out things that will not pass validation
        region = package_dict['geographic_region']
        if region == "Canada  Canada":
            region = ''
        package_dict['geographic_region'] = region.replace(
            "Yukon Territory  Territoire du Yukon", "Yukon  Yukon")
        package_dict['author_email'] = '*****@*****.**'
        package_dict['catalog_type'] = schema_description.dataset_field_by_id['catalog_type']['choices'][0]['key']
        # Override validation
        package_dict['validation_override'] = validation_override

        # Fix dates
        try:
            t = common.time_coverage_fix(package_dict['time_period_coverage_start'],
                                         package_dict['time_period_coverage_end'])
            package_dict['time_period_coverage_start'] = common.timefix(t[0])
            package_dict['time_period_coverage_end'] = common.timefix(t[1])
        except KeyError:
            # Times were never set
            package_dict['time_period_coverage_start'] = "1000-01-01"
            package_dict['time_period_coverage_end'] = "3000-01-01"

        package_dict['date_published'] = package_dict['date_published'].replace("/", "-")
        for date_key in ('time_period_coverage_start',
                         'time_period_coverage_end',
                         'date_published'):
            package_dict[date_key] = check_date(package_dict[date_key])

        # Records still pending in the workflow get no release date.
        if node.find("FLOWSTATUS").text == "pending":
            package_dict['portal_release_date'] = ''
        else:
            package_dict['portal_release_date'] = '2013-05-24'

        package_dict['ready_to_publish'] = True
        package_dict['license_id'] = 'ca-ogl-lgo'

        def reformat_date(date_string):
            """Return an ISO date for a m/d/Y string, falling back to Y/m/d."""
            try:
                timepoint = datetime.strptime(date_string.strip(), "%m/%d/%Y")
            except ValueError:
                timepoint = datetime.strptime(date_string.strip(), "%Y/%m/%d")
            return timepoint.date().isoformat()

        if "/" in package_dict['date_modified']:
            package_dict['date_modified'] = reformat_date(package_dict['date_modified'])

        key_eng = (package_dict['keywords']
                   .replace("\n", " ").replace("/", "-")
                   .replace("(", "").replace(")", "")
                   .replace(":", "-").replace(u"´", "'")
                   .split(","))
        key_fra = (package_dict['keywords_fra']
                   .replace("\n", " ").replace("/", "-")
                   .replace('"', '').replace("(", "")
                   .replace(":", "-").replace(")", "")
                   .split(","))

        # Keep keywords of 2..99 characters (length measured before stripping).
        # NOTE(review): only the English keywords are stripped of surrounding
        # whitespace -- confirm whether the French ones should be as well.
        package_dict['keywords'] = ",".join(
            [k.strip() for k in key_eng if 1 < len(k) < 100])
        package_dict['keywords_fra'] = ",".join(
            [k for k in key_fra if 1 < len(k) < 100])

        if package_dict['owner_org'] == 'aafc-aac':
            # Keep only the text after the first marker found in each title.
            for title_key in ('title', 'title_fra'):
                for marker in agriculture_title_markers:
                    if marker in package_dict[title_key]:
                        remainder = package_dict[title_key].split(marker)[1]
                        package_dict[title_key] = remainder.lstrip(" ")
                        break

        # Checked at the same level as the aafc-aac test: nested inside it,
        # this branch could never run.
        if package_dict['owner_org'] == 'hc-sc':
            for resource in package_dict['resources']:
                if resource['resource_type'] == 'file':
                    resource['format'] = 'TXT'

        return package_dict