def check_language(file):
    '''
    Print the language of every record whose form id is in not_in_new.pkl.

    Diagnostic for ids that crept into the .jl files because of a record
    ordering problem; per the original author's note those ids are expected
    to be French, and no French ids should be in the .jl file.

    :param file: XML source handed straight to ``etree.parse``.  The name
        shadows a builtin but is kept so existing callers keep working.
    :returns: None; results are printed.
    '''
    # Close the pickle file deterministically instead of leaking the handle.
    with open('not_in_new.pkl', 'rb') as pickled_ids:
        not_in_new = pickle.load(pickled_ids)

    final = etree.parse(file).getroot()
    for child in final:
        form_ids = child.xpath("FORM[NAME='thisformid']/A/text()")
        # Only the "no form id" case is guarded; an IndexError raised inside
        # common.language() must not be reported as a missing form id.
        try:
            formid = str(form_ids[0]).lower()
        except IndexError:
            # Single-argument print(...) emits the same text as the Python 2
            # print statement with two comma-separated items.
            print("SMALL NO FORM ID %s" % (form_ids,))
            continue
        if formid in not_in_new:
            print(common.language(child))

    print("Conlusion, all records have french primary ids, and thus must be removed")
def process_doubles(self, datafile):
    '''
    Merge each English/French record pair and write it as one JSON line.

    For every pair yielded by ``self.combined_elements(root)`` both nodes are
    run through ``Transform().process_node``; the French package's non-HTML
    resources are appended to the English package, a short status report is
    printed, and the English package is written to ``self.outfile``.

    :param datafile: XML source handed straight to ``etree.parse``.
    :raises Exception: when the merged package ends up with no resources.
    '''
    root = etree.parse(datafile).getroot()
    for i, pair in enumerate(self.combined_elements(root)):
        node_en = pair[0]
        # NOTE(review): the "is it really English?" guard was disabled by the
        # author (``pass`` with ``continue`` commented out).  The lookup is
        # kept so anything it raises on a malformed node still surfaces here;
        # confirm whether the guard should be re-enabled to protect the
        # record order in pilot-matched.xml.
        common.language(node_en)
        node_fr = pair[1]

        package_en = Transform().process_node(i, node_en, "eng; CAN")
        package_fr = Transform().process_node(i, node_fr, u"fra; CAN")

        # Transfer French data to the English package, skipping HTML resources.
        package_en['resources'].extend(
            [pack for pack in package_fr['resources'] if pack['format'] != "HTML"]
        )

        if not package_en['resources']:
            raise Exception(
                "record pair %s (%s) has no resources" % (i, package_en['id'])
            )

        # Single-argument print(...) with %-formatting emits the same text as
        # the Python 2 print statement with comma-separated items.
        if not package_en['owner_org']:
            print("############### NO ORGANIZATION ########### %s" % (package_en['id'],))
        elif not package_en['title']:
            print("############### NO TITLE ########### %s" % (package_en['id'],))
        elif not package_en['id']:
            # Report the missing id the same way the other branches report.
            print("############ NO ID ########### %s" % (package_en['id'],))
        else:
            print("%s OK %s" % (i, package_en['id']))
            print(package_en['title'])
            print(package_en['title_fra'])

        # Debugging hook the author left in for one specific French record.
        if package_fr['id'] == "dafb6413-5dab-45ca-bcd2-8c6ff4b67be5":
            print("STOP")

        self.outfile.write(json.dumps(package_en) + "\n")
def _bilingual_form_ids(root, missing_id_msg, missing_lang_msg, show_titles=False):
    '''Return the lower-cased form ids of the children of *root* that are Bilingual.'''
    form_ids = []
    for child in root:
        try:
            formid = str(child.xpath("FORM[NAME='thisformid']/A/text()")[0]).lower()
        except (IndexError, UnicodeError):
            # IndexError: the record has no form id.  UnicodeError: str() on
            # non-ASCII text under Python 2.
            print(missing_id_msg)
            if show_titles:
                print(child.find("DC.TITLE").text)
                print(child.xpath("FORM[NAME='title_en']/A/text()"))
            # Skip the record: carrying on would file it under the previous
            # record's form id.
            continue
        try:
            if common.language(child) == "Bilingual":
                form_ids.append(formid)
        except Exception:
            # Best effort: common.language() is opaque here, so report the
            # failure and move on to the next record.
            print(missing_lang_msg)
    return form_ids


def find_missing_bilingual(file1, file2):
    '''
    Compare the Bilingual form ids found in two XML files.

    Prints whether every Bilingual id of *file1* also appears among the
    Bilingual ids of *file2*, then prints the ids present only in *file2*.

    :param file1: XML source for the first set (labelled "SMALL" in the
        diagnostics), handed straight to ``etree.parse``.
    :param file2: XML source for the second set (labelled "BIG" in the
        diagnostics), handed straight to ``etree.parse``.
    :returns: None; results are printed.
    '''
    root1 = etree.parse(file1).getroot()
    root2 = etree.parse(file2).getroot()

    biset = set(_bilingual_form_ids(root1, "SMALL NO FORM ID", "small NO Language"))
    fullset = set(
        _bilingual_form_ids(root2, "BIG NO FORM ID", "BIG NO Language", show_titles=True)
    )

    print(biset.issubset(fullset))
    diff = fullset.difference(biset)
    print(diff)
def _parse_fields(self, node):
    '''
    Fill ``self.fields`` and ``self.resources`` from one pilot record node.

    * Every schema field that has a pilot name is read from
      ``FORM[NAME='<pilot_name>']/A/text()``; values containing ``|`` are
      mapped through the field's ``choices_by_pilot_uuid`` table, and missing
      values are stored as ``""``.
    * ``language`` and, when both corners are present, a ``spatial`` polygon
      are added on top of the schema fields.
    * Up to four ``dataset_link_en_<n>`` resources are collected (stopping at
      the first missing link), followed by the supplementary documentation
      and data dictionary links.

    :param node: lxml element for one record.
    :raises KeyError: when a pilot UUID is not in ``choices_by_pilot_uuid``.
    :raises IndexError: when a dataset format value contains no ``|``.
    :raises ValueError: when a corner cannot be split into two values.
    '''
    # Package fields described by the schema.
    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        if not pilot_name:
            # ckan_name / field does not belong at PilotRecord level.
            # Process in CanadaRecord.
            continue
        element = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
        try:
            value = element[0].strip()
        except IndexError:
            self.fields[pilot_name] = ""
            continue
        if "|" in value:
            # Pilot UUID codes: the part after the pipe is the lookup key.
            split_value = value.split("|")[1]
            self.fields[pilot_name] = field['choices_by_pilot_uuid'][split_value]['key']
        else:
            self.fields[pilot_name] = value

    # Data that is not defined in the schema.
    self.fields['language'] = common.language(node)

    geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
    geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()")
    if geo_lower_left and geo_upper_right and geo_lower_left[0] != "N/A":
        # Single-argument print(...) emits the same text as the Python 2
        # print statement with comma-separated items.
        print("GEO  %s %s" % (geo_upper_right, geo_lower_left))
        try:
            left, bottom = geo_lower_left[0].split(" ")
            right, top = geo_upper_right[0].split(" ")
        except ValueError:
            # Values such as '84 - 43' or '41.5 - 141' use a spaced dash that
            # should be a minus sign.  Attach it to the number and split on
            # the remaining space so the sign is kept; splitting on " -"
            # would throw the sign away, even for a well-formed corner.
            left, bottom = geo_lower_left[0].replace(" - ", " -").split(" ")
            right, top = geo_upper_right[0].replace(" - ", " -").split(" ")
        coordinates = [[left, bottom], [left, top], [right, top], [right, bottom]]
        self.fields['spatial'] = {'type': 'Polygon', 'coordinates': coordinates}

    # Resources: numbered dataset links, stopping at the first gap.
    for i in range(1, 5):
        url = node.xpath("FORM[NAME='dataset_link_en_%d']/A/text()" % i)
        if not url:
            break
        resource_dict = {'url': url[0]}
        # Debugging hook the author left in for one specific file.
        if "http://data.gc.ca/commonwebsol/fileuploads/C/4/0/C4060F22-17EB-450D-9B5E-A1216E75DF47/Dictionnaire" in resource_dict['url']:
            print("STOP")
        # Force a language from parent
        resource_dict['language'] = self.fields['language']
        resource_format = node.xpath("FORM[NAME='dataset_format_%d']/A/text()" % i)
        if resource_format:
            resource_dict['format'] = resource_format[0].split("|")[1]
        self.resources.append(PilotResource(resource_dict, 'dataset_link_en_'))

    # Extra documents are attached as resources carrying only a url.
    extras = ['supplementary_documentation_en',
              'supplementary_documentation_fr',
              'data_dictionary_fr',
              'dictionary_list:_en']
    for extra in extras:
        url = node.xpath("FORM[NAME='%s']/A/text()" % extra)
        if url:
            self.resources.append(PilotResource({'url': url[0]}, extra))