Example #1
0
 def check_structure(dict):
     fields =  [ckan for ckan,pilot,field in schema.dataset_all_fields() if field['type'] not in [u'fixed',u'calculated']] 
     mandatory = [ckan for ckan,pilot,field in schema.dataset_all_fields() if field['mandatory'] == u'all']  
     fields.append('resources')  
     fields.append('validation_override')     
     missing_fields = set(dict.iterkeys()).symmetric_difference(set(fields)) 
     
     mandatory_fields = set(mandatory).intersection(set(fields))
     print "Missing Mandatory Fields", missing_fields.intersection(mandatory_fields)
     print "Missing Values ", [key for key,value in dict.items() if value=='MISSING']
     print "------------- Details ---------------"
     print "Fields Missing from Package_dict"
     pprint(list(missing_fields))
     
     print "Mandatory Fields that are not fixed or calculated"
     pprint(mandatory_fields)
Example #2
0
def table():
    Recs = []
    t = PrettyTable(['No.','CKAN Name','Description','Pilot Name'])
    t.align["City name"] = "l" # Left align city names
    t.padding_width = 1 # One space between column edges and contents (default)
    print u'\u2019'.encode('utf-8') 
    for i, (ckan_name, pilot_name, field) in enumerate(schema_description.dataset_all_fields()):
        description = field['description']['eng']
        #description = field['description']['eng'].replace(u'\u2019','') #Fix bad windows chars
        t.add_row([i,str(ckan_name),
                     description,
                     str(pilot_name)
                     ])
    t.align='l'
    print t
Example #3
0
    def _process_node(self,node):
        #print node.xpath("FORM[NAME='thisformid']/A/text()")
        #print etree.tostring(node, with_tail=True)
        
        
        package_dict = {'resources': [], 'tags':[]}
        
        for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
            
            
            try: # the simplest case, one to one mapping of values
            # temporary hack because name has not been mapped to thisformid in the schema
                if ckan_name == "id":
                    #package_dict['id'] = node.xpath("FORM[NAME='thisformid']/A/text()")[0]
                    continue
                elif ckan_name == 'name':
                    package_dict['name'] = "pilot-" + str(node.xpath("FORM[NAME='thisformid']/A/text()")[0]).split("-")[0].lower()
                    continue
                elif ckan_name== 'tags':
                    continue
                else:
                    print ckan_name, pilot_name
                    value = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)[0]

                if "|" in value:
                    split_value = value.split("|")[1]
                    rval = field['choices_by_pilot_uuid'][split_value]
                    package_dict[ckan_name] = rval['key']
                else:
                    package_dict[ckan_name] = value
            except IndexError:  #when None, eg. same as elif pilot_name is None:
               package_dict[ckan_name] = ''
               print "INDEX ERROR"
               pass
            except KeyError:
                print "KEY ERROR"
                
                pass
            
            # now do resources, use my own logic as 
            resources = []
            resource_dict = {}
            if ckan_name in schema_description.all_resource_fields:
                try:
                    value  = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)[0]
           
                    if "|" in value:
                        split_value = value.split("|")[1]
                        rval = field['choices_by_pilot_uuid'][split_value]
                        resource_dict[ckan_name] = rval['key']
                    else: 
                        resource_dict[ckan_name] = value
                        
                except IndexError:
                        continue
                    
                resources.append(resource_dict)
            
                package_dict['resources'] = resources 
            
            
            
        pprint(package_dict)
        #sys.exit()
        self.out.write(json.dumps(package_dict) + "\n")
Example #4
0
def process_record(node):
    """Build a CKAN-style dict from one record node, dump it and exit.

    For every schema field the value is read from the node with the XPath
    ``FORM[NAME='<pilot_name>']/A/text()``.  The text before the first
    ``|`` selects an optional converter from
    ``mappings.code_mapping_strategies``, which is applied to the text
    after it.  When the node has no value, a default is taken from
    ``mappings.default_strategies`` or synthesised as ``"default_<name>"``.

    Fields in ``schema_description.extra_package_fields`` are moved under
    ``data['extras']``; resource fields are filled with placeholder values.

    NOTE: this is a debugging routine.  It prints diagnostics, writes the
    result to a hard-coded path and then terminates the process with
    ``sys.exit()``, so it never returns.
    """
    data = {}
    extras = {}
    resource = {}
    resources = []

    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        try:  # the simplest case, one to one mapping of values
            value = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)[0]
            pilot_code = value.split('|')
            if pilot_code[0] in mappings.code_mapping_strategies:
                data[ckan_name] = mappings.code_mapping_strategies[pilot_code[0]](pilot_code[1])
            else:
                data[ckan_name] = value
        except UnicodeDecodeError:
            print "UNICODE ERROR"
        except IndexError:  # no value in the node for this field
            if ckan_name == "name":
                data['name'] = "statcan-" + mappings.random_id()
                print
            elif ckan_name in mappings.default_strategies:
                data[ckan_name] = mappings.default_strategies[ckan_name]()
            else:
                data[ckan_name] = "default_" + ckan_name

        # reorganize dict for CKAN
        if ckan_name in schema_description.extra_package_fields:
            # After a UnicodeDecodeError nothing was stored for this field,
            # so there is nothing to move; an unguarded lookup would raise
            # KeyError here.
            if ckan_name in data:
                extras[ckan_name] = data.pop(ckan_name)
        # now populate packages
        elif ckan_name == 'url':
            resource['url'] = "http://www.statcan.gc.ca/cgi-bin/sum-som/fl/cstsaveascsv.cgi?filename=arts63a-fra.htm&lan=fre"
        elif ckan_name in schema_description.all_resource_fields:
            resource[ckan_name] = "default_package_value " + ckan_name

    resources.append(resource)
    data['extras'] = extras
    data['resources'] = resources
    data['groups'] = ["statcan"]

    # Debug output: strip non-printable characters from a sample string and
    # report whether the result decodes as UTF-8.
    s = "some\x00string. with\x15 funny characters"
    foo = filter(lambda x: x in string.printable, s)
    print foo
    valid_utf8 = True
    try:
        foo.decode('utf-8')
    except UnicodeDecodeError:
        valid_utf8 = False
    print valid_utf8
    whatisthis(data)
    data2 = json.dumps(data, encoding="utf-8")

    whatisthis(data2)
    data3 = filter(lambda x: x in string.printable, s)
    whatisthis(data3)
    # NOTE(review): hard-coded, developer-specific output path.
    with open('/Users/peder/Desktop/data.json', 'w') as outfile:
        json.dump(data, outfile)

    sys.exit()
Example #5
0
    def _parse_fields(self,node):
        ''' package fields '''
        for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
            if pilot_name:
                path = "FORM[NAME='%s']/A/text()"%pilot_name
                element = node.xpath(path)
                
                try:
                    # Deal with Pilot UUID CODES
                    value = element[0].strip()
                    if "|" in value:
                        split_value=value.split("|")[1]
                        self.fields[pilot_name] = field['choices_by_pilot_uuid'][split_value]['key']

                    else:
                        self.fields[pilot_name]=value
                except IndexError:
                    self.fields[pilot_name]=""
            else:
                # ckan_name / field does not belong at PilotRecord level.  Process in CanadaRecord
                pass
        ''' Grab data that is not defined in schema '''
        self.fields['language'] = common.language(node)
        

        geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
        geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()") 
        if geo_lower_left and geo_upper_right and  geo_lower_left[0] != "N/A":
            print "GEO ", geo_upper_right, geo_lower_left       
            try:
                left,bottom = geo_lower_left[0].split(" ")
                right, top = geo_upper_right[0].split(" ")
            except ValueError:
                '''  To catch values that have a dash that should perhaps be a minus  ['84 - 43'] ['41.5 - 141']  '''
                left,bottom = geo_lower_left[0].replace(" - "," -").split(" -")
                right, top = geo_upper_right[0].replace(" - "," -").split(" -")
                
            coordinates = [[left, bottom], [left,top], [right, top], [right, bottom]]
            self.fields['spatial']= {'type': 'Polygon', 'coordinates': coordinates}  
            
            
        ''' resources'''
        try:
            for i in range(1,5):
                url = node.xpath("FORM[NAME='dataset_link_en_%d']/A/text()" % i)
                if url:
                    resource_dict = {}
                    resource_dict['url']=url[0]
                    if "http://data.gc.ca/commonwebsol/fileuploads/C/4/0/C4060F22-17EB-450D-9B5E-A1216E75DF47/Dictionnaire" in resource_dict['url']:
                        print "STOP"
                    # Force a language from parent
                    resource_dict['language'] = self.fields['language']
                    format = node.xpath("FORM[NAME='dataset_format_%d']/A/text()" % i)
                    size = node.xpath("FORM[NAME='dataset_size_%d']/A/text()" % i)
                  
                    if format:resource_dict['format']=format[0].split("|")[1]
                    self.resources.append(PilotResource(resource_dict,'dataset_link_en_'))
                else:               
                    break
     
            extras=['supplementary_documentation_en',
                    'supplementary_documentation_fr',
                    'data_dictionary_fr',
                    'dictionary_list:_en']
    
            
            for extra in extras:
                url= node.xpath("FORM[NAME='%s']/A/text()"% extra)
                if url:
                    resource_dict = {}
                    resource_dict['url']=url[0]    
                    self.resources.append(PilotResource(resource_dict,extra))
        except:
            raise
Example #6
0
    def process_node(self,count, node, language):

        try:
            id = str(node.xpath("FORM[NAME='thisformid']/A/text()")[0]).lower() 
#            if id == "2da1db44-d00f-4764-8524-d42e3b798ce0":
#                print "STOP"

        except:
            print "======NO ID=========", node.xpath("DC.TITLE")[0].text
        
        try:
            
            geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
            geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()") 
           
            spatial=''
            if geo_lower_left and geo_upper_right:

                 left,bottom = geo_lower_left[0].split(" ")
                 right, top = tuple(geo_upper_right[0].split(" "))
                 coordinates = [[left, bottom], [left,top], [right, top], [right, bottom]]
                 spatial = {'type': 'Polygon', 'coordinates': coordinates}  
                 print spatial          
                 #sys.exit()
        except:
            print "NO GEO"
            #raise
            

        package_dict = {'resources': []}
        
        package_dict['resources']  = self.node_resources(node,language)
        package_dict['spatial']= spatial

        for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
          
            try:
                     
                if ckan_name == "id":
                    package_dict['id'] =  str(node.xpath("FORM[NAME='thisformid']/A/text()")[0]).lower() 
                    continue

                elif ckan_name == 'name':
                
                    continue

                elif ckan_name== 'tags':
                    continue
                elif ckan_name == 'title':

                    t = node.xpath("FORM[NAME='title_en']/A/text()")[0]
                    
                    package_dict['title'] =  self.strip_title(t)
                    if t == None: raise "No English Title", t
                    continue
                    
                elif ckan_name=='title_fra':
                    # Look for 
                    t_fr = node.xpath("FORM[NAME='title_fr']/A/text()")[0]
                    if t_fr == None: raise "No French Title", t_fr
                    # Filter out -version anglaise etc
                    for marker in common.language_markers_fra:
                        if marker in t_fr:
                            package_dict['title_fra'] = t_fr.split(marker)[0]
                            break
                        package_dict['title_fra'] =  t_fr
                    continue
        
                value =''
                if pilot_name:
                    if pilot_name=="url_fra": 
                        print pilot_name
                    try: 
                        
                        result = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
                        if result:
                            value  = result[0]
                        else: 
                            value =''
                  
                    except IndexError as e:
                            print e
     
                if "|" in value:
                    split_value = value.split("|")[1]
                    rval = field['choices_by_pilot_uuid'][split_value]
                    package_dict[ckan_name] = rval['key']
                    
                    if pilot_name == "department":
                        package_dict['owner_org'] = field['choices_by_pilot_uuid'][split_value]['key']
                        
                else:
                    if pilot_name == 'frequency':
                        if value:
                            package_dict['maintenance_and_update_frequency'] = pilot_frequency_list[value]
                        else:
                            package_dict['maintenance_and_update_frequency'] = pilot_frequency_list['']
                        continue
                    else:

                        package_dict[ckan_name] = value
            except IndexError:  #when None, eg. same as elif pilot_name is None:
               package_dict[ckan_name] = ''

               continue
               print count, "INDEX ERROR ", ckan_name, pilot_name,package_dict[pilot_name]
               
            except KeyError as e:
                print "KEY ERROR : ", ckan_name, pilot_name, e 
                package_dict[ckan_name] = ''
                continue
                

        # Filter out things that will not pass validatation
        if package_dict['geographic_region'] == "Canada  Canada":package_dict['geographic_region']=''
        region = package_dict['geographic_region']
        package_dict['geographic_region'] = region.replace("Yukon Territory  Territoire du Yukon","Yukon  Yukon" )
        package_dict['author_email'] =  '*****@*****.**'  
        package_dict['catalog_type'] = schema_description.dataset_field_by_id['catalog_type']['choices'][0]['key']
        #Override validation
        package_dict['validation_override']=validation_override
        #Fix dates
        try:

            t = common.time_coverage_fix(package_dict['time_period_coverage_start'],package_dict['time_period_coverage_end'])
           
            package_dict['time_period_coverage_start'] =common.timefix(t[0])
            package_dict['time_period_coverage_end'] = common.timefix(t[1])
             
        except KeyError:
            ''' Times were never set '''
            package_dict['time_period_coverage_start'] ="1000-01-01"
            package_dict['time_period_coverage_end']  ="3000-01-01"
            
      
        package_dict['date_published'] = package_dict['date_published'].replace("/", "-")
        package_dict['time_period_coverage_start']=check_date(package_dict['time_period_coverage_start'])
        package_dict['time_period_coverage_end']=check_date(package_dict['time_period_coverage_end'])
        package_dict['date_published']=check_date(package_dict['date_published'])
        package_dict['portal_release_date']='2013-05-24'
        if node.find("FLOWSTATUS").text == "pending":
            package_dict['portal_release_date']=''
 
        package_dict['ready_to_publish']=True
        
        
        package_dict['license_id']='ca-ogl-lgo'
        #if count>1200:sys.exit()
        def reformat_date(date_string):
            try:
                timepoint = datetime.strptime(date_string.strip(), "%m/%d/%Y") 
            except ValueError:
                timepoint = datetime.strptime(date_string.strip(), "%Y/%m/%d")
            day = timepoint.date()
            return day.isoformat()

        if "/" in package_dict['date_modified']: package_dict['date_modified']=reformat_date(package_dict['date_modified'])
        key_eng = package_dict['keywords'].replace("\n"," ").replace("/","-").replace("(","").replace(")","").replace(":","-").replace(u"ยด","'").split(",")
        key_fra = package_dict['keywords_fra'].replace("\n"," ").replace("/","-").replace('"','').replace("(","").  replace(":","-").replace(")","").split(",")

        package_dict['keywords'] = ",".join([k.strip() for k in key_eng if len(k)<100 and len(k)>1])
        package_dict['keywords_fra'] = ",".join([k for k in key_fra if len(k)<100 and len(k)>1])

        if package_dict['owner_org']=='aafc-aac':
            for marker in agriculture_title_markers:
                if marker in package_dict['title']:
                    new = package_dict['title'].split(marker)[1]
                    package_dict['title']=new.lstrip(" ")
                    break
                    
            for marker in agriculture_title_markers:
                if marker in package_dict['title_fra']:
                    new_fr = package_dict['title_fra'].split(marker)[1]
                    package_dict['title_fra']=new_fr.lstrip(" ")
                    break
         
            if package_dict['owner_org']=='hc-sc':
                for resource in package_dict['resources']:
                    if resource['resource_type']=='file':
                        resource['format']='TXT'

                            
        #print count,package_dict['title'], len(package_dict['resources'])
        return package_dict