Beispiel #1
0
def check_language(file):
    """Print the language of every record in *file* listed in ``not_in_new.pkl``.

    Sanity check for the hypothesis that the ``not_in_new`` ids crept into
    the .jl files because of an ordering problem and are actually French
    (no French ids should be in the .jl file).

    :param file: path or file object accepted by ``etree.parse``.

    For each child of the document root the ``thisformid`` value is
    lower-cased and looked up in the unpickled ``not_in_new`` collection;
    on a hit the result of ``common.language(child)`` is printed.  Records
    for which an ``IndexError`` occurs (typically: no ``thisformid`` value)
    are reported and skipped.
    """
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # ever point this at a trusted, locally produced file.
    with open('not_in_new.pkl', 'rb') as handle:
        not_in_new = pickle.load(handle)
    final = etree.parse(file).getroot()

    for child in final:
        # Evaluate the query once; the result is reused for the diagnostic.
        matches = child.xpath("FORM[NAME='thisformid']/A/text()")
        try:
            formid = str(matches[0]).lower()
            if formid in not_in_new:
                print(common.language(child))
        except IndexError:
            print("SMALL NO FORM ID %s" % (matches,))
    print("Conlusion, all records have french primary ids, and thus must be removed")
Beispiel #2
0
    def process_doubles(self, datafile):
        """Merge paired records from *datafile* and write them as JSON lines.

        ``self.combined_elements(root)`` is iterated as a sequence of pairs;
        element 0 is processed as the English record and element 1 as the
        French record.  Every non-HTML resource of the French package is
        appended to the English package, which is then validated and, when
        complete, written to ``self.outfile`` as one JSON document per line.

        Packages lacking ``owner_org``, ``title`` or ``id`` are reported on
        stdout and NOT written.

        :param datafile: path or file object accepted by ``etree.parse``.
        :raises Exception: when a merged package ends up with no resources.
        """
        root = etree.parse(datafile).getroot()

        for i, pair in enumerate(self.combined_elements(root)):
            node_en = pair[0]
            node_fr = pair[1]
            # TODO(review): the original intent was to verify here that
            # node_en really is English (i.e. that the ordering in
            # pilot-matched.xml has not been broken); that check was never
            # implemented and the dead branch has been removed.

            package_en = Transform().process_node(i, node_en, "eng; CAN")
            package_fr = Transform().process_node(i, node_fr, u"fra; CAN")

            # Transfer French data to the English package.
            package_en['resources'].extend(
                [pack for pack in package_fr['resources']
                 if pack['format'] != "HTML"])

            if not package_en['resources']:
                raise Exception(
                    "No resources for package %r (pair %d)"
                    % (package_en.get('id'), i))

            if not package_en['owner_org']:
                print("############### NO ORGANIZATION ########### %s"
                      % (package_en['id'],))
            elif not package_en['title']:
                print("############### NO TITLE ########### %s"
                      % (package_en['id'],))
            elif not package_en['id']:
                # The original line lacked the print, so this diagnostic
                # was silently discarded.
                print("############ NO ID ########### %s"
                      % (package_en['id'],))
            else:
                print("%s OK %s" % (i, package_en['id']))
                print(package_en['title'])
                print(package_en['title_fra'])

                # Debugging hook for one specific record.
                if package_fr['id'] == "dafb6413-5dab-45ca-bcd2-8c6ff4b67be5":
                    print("STOP")
                self.outfile.write(json.dumps(package_en) + "\n")
Beispiel #3
0
def _bilingual_form_ids(root, label, describe_missing=False):
    """Collect the lower-cased form ids of the "Bilingual" records under *root*.

    :param root: iterable of record elements supporting ``xpath``/``find``.
    :param label: tag used in diagnostics ("SMALL" / "BIG").
    :param describe_missing: when true, also print the ``DC.TITLE`` text and
        the ``title_en`` value of records that have no form id.
    :returns: list of form ids, in document order.

    Records without a ``thisformid`` value, or whose language lookup raises,
    are reported on stdout and skipped.
    """
    form_ids = []
    for child in root:
        matches = child.xpath("FORM[NAME='thisformid']/A/text()")
        if not matches:
            print("%s NO FORM ID" % label)
            if describe_missing:
                title = child.find("DC.TITLE")
                # find() returns None when the element is absent.
                print(title.text if title is not None else None)
                print(child.xpath("FORM[NAME='title_en']/A/text()"))
            # Skip: without an id there is nothing meaningful to record
            # (previously the id of the preceding record was reused).
            continue
        formid = str(matches[0]).lower()

        try:
            lang = common.language(child)
        except Exception:
            # Best effort: failures of common.language are reported, not fatal.
            print("%s NO Language" % label.lower())
            continue
        if lang == "Bilingual":
            form_ids.append(formid)
    return form_ids


def find_missing_bilingual(file1, file2):
    """Report bilingual form ids that are in *file2* but missing from *file1*.

    Both files are parsed with ``etree.parse``.  The set of "Bilingual" form
    ids is built for each; the function then prints whether the ids of
    *file1* are a subset of those of *file2*, followed by the ids found only
    in *file2*.

    :param file1: the smaller document (diagnostics are tagged "SMALL").
    :param file2: the full/source document (diagnostics are tagged "BIG").
    """
    root1 = etree.parse(file1).getroot()
    root2 = etree.parse(file2).getroot()

    biset = set(_bilingual_form_ids(root1, "SMALL"))
    fullset = set(_bilingual_form_ids(root2, "BIG", describe_missing=True))

    print(biset.issubset(fullset))
    diff = fullset.difference(biset)
    print(diff)
    def _parse_fields(self, node):
        """Populate ``self.fields`` and ``self.resources`` from record *node*.

        Three things are extracted:

        * every schema field that has a pilot name (values of the form
          ``"...|<uuid>"`` are translated through the field's
          ``choices_by_pilot_uuid`` table; missing values become ``""``);
        * data not described by the schema: ``language`` and, when both
          corner values are present, a ``spatial`` polygon;
        * resources: up to four ``dataset_link_en_<n>`` links (stopping at
          the first gap) plus the supplementary documentation / dictionary
          links.

        :param node: record element supporting ``xpath``.
        """
        # Package fields
        for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
            if not pilot_name:
                # ckan_name / field does not belong at PilotRecord level.
                # Processed in CanadaRecord.
                continue
            element = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
            try:
                # Deal with Pilot UUID codes
                value = element[0].strip()
                if "|" in value:
                    split_value = value.split("|")[1]
                    self.fields[pilot_name] = field['choices_by_pilot_uuid'][split_value]['key']
                else:
                    self.fields[pilot_name] = value
            except IndexError:
                self.fields[pilot_name] = ""

        # Grab data that is not defined in the schema
        self.fields['language'] = common.language(node)

        geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
        geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()")
        if geo_lower_left and geo_upper_right and geo_lower_left[0] != "N/A":
            print("GEO  %s %s" % (geo_upper_right, geo_lower_left))
            try:
                left, bottom = geo_lower_left[0].split(" ")
                right, top = geo_upper_right[0].split(" ")
            except ValueError:
                # Catch values with a detached dash that should be a minus
                # sign, e.g. '84 - 43' or '41.5 - 141'.  Re-attach the dash
                # to the number and split on whitespace so the sign is kept
                # ('84 - 43' -> ['84', '-43']).
                left, bottom = geo_lower_left[0].replace(" - ", " -").split()
                right, top = geo_upper_right[0].replace(" - ", " -").split()

            coordinates = [[left, bottom], [left, top], [right, top], [right, bottom]]
            self.fields['spatial'] = {'type': 'Polygon', 'coordinates': coordinates}

        # Resources
        for i in range(1, 5):
            url = node.xpath("FORM[NAME='dataset_link_en_%d']/A/text()" % i)
            if not url:
                # Links are numbered contiguously; stop at the first gap.
                break
            resource_dict = {}
            resource_dict['url'] = url[0]
            # Debugging hook for one specific resource.
            if "http://data.gc.ca/commonwebsol/fileuploads/C/4/0/C4060F22-17EB-450D-9B5E-A1216E75DF47/Dictionnaire" in resource_dict['url']:
                print("STOP")
            # Force a language from parent
            resource_dict['language'] = self.fields['language']
            fmt = node.xpath("FORM[NAME='dataset_format_%d']/A/text()" % i)
            if fmt:
                # The part after the "|" is kept; a value without "|"
                # raises IndexError here.
                resource_dict['format'] = fmt[0].split("|")[1]
            self.resources.append(PilotResource(resource_dict, 'dataset_link_en_'))

        extras = ['supplementary_documentation_en',
                  'supplementary_documentation_fr',
                  'data_dictionary_fr',
                  'dictionary_list:_en']

        for extra in extras:
            url = node.xpath("FORM[NAME='%s']/A/text()" % extra)
            if url:
                resource_dict = {}
                resource_dict['url'] = url[0]
                self.resources.append(PilotResource(resource_dict, extra))