def test_trans_str():
    """Exercise trans_str on ASCII letters and on Chinese text with punctuation."""
    # Each row: (input text, characters to replace, replacements, expected).
    cases = [
        ("this is a test", "abcde", "ABCDE", "this is A tEst"),
        ("中海油(天津)管道工程技术有限公司", "()", "()",
         "中海油(天津)管道工程技术有限公司"),
        ("他说:“‘好’极了!”", "“”‘’:!", "\"\"'':!",
         "他说:\"'好'极了!\""),
    ]
    for text, source_chars, target_chars, expected in cases:
        assert trans_str(text, source_chars, target_chars) == expected
def parse_applicants(self, applicants, address=None, include_org=False):
    """Classify each applicant in a ";"-separated string by kind and state.

    Every piece of ``applicants`` is passed to ``self.parse_applicant``;
    pieces whose kind is ``None`` are skipped. A state is then looked for
    in this order: the foreign-industry pattern, the applicant text parsed
    as an address, a university lookup (university applicants only), and
    finally the main ``address``. Applicants that end up without a state
    are dropped.

    Args:
        applicants: Applicant names joined by ";".
        address: Optional main address used as the last-resort state source.
        include_org: When true, return organisation names as well.

    Returns:
        If ``include_org`` is true, a list of
        ``(kind, state, name, main_org)`` tuples in input order.
        Otherwise ``(pairs, main_country, main_state)`` where ``pairs`` is
        a list of ``(kind, state)`` tuples; when there were several
        results they are de-duplicated and sorted, and a lone survivor of
        the de-duplication is listed twice.
    """
    # NOTE(review): the nesting of this method was reconstructed from a
    # whitespace-collapsed source -- confirm the branch structure,
    # especially that duplication of a single result only happens after
    # de-duplication of several results.
    main_country, main_state = self.parse_address(address)

    parsed = []
    for name in re.split(";|;", applicants):
        kind = self.parse_applicant(name)
        if kind is None:
            continue

        state = None
        if self.foreign_industry_re.search(name):
            state = self.FOREIGN
        else:
            country, state = self.parse_address(name)
            if state is None:
                if country is not None:
                    # A known country other than the mainland is foreign.
                    if country != self.MAINLAND:
                        state = self.FOREIGN
                elif kind == self.UNIVERSITY:
                    _, state = self.parse_univ(name)
                    # An unknown university is assumed foreign unless the
                    # main address is in the mainland.
                    if state is None and main_country != self.MAINLAND:
                        state = self.FOREIGN

        # Last resort: borrow the state from the main address.
        if state is None and main_country:
            state = main_state if main_country == self.MAINLAND else self.FOREIGN

        if not state:
            continue
        if not include_org:
            parsed.append((kind, state))
            continue

        cleaned = name.decode('utf8').strip(u" ").encode('utf8')
        cleaned = trans_str(cleaned, "() ", "() ")
        parsed.append((kind, state, cleaned, self.main_org(cleaned)))

    if include_org:
        return parsed

    if len(parsed) > 1:
        unique = set(parsed)
        if len(unique) == 1:
            # All applicants collapsed into one entry: keep it as a pair.
            parsed = list(unique) * 2
        else:
            parsed = sorted(unique)
    return parsed, main_country, main_state
def parse_address(self, address):
    """Parse an address and return a ``(country, state)`` pair.

    The checks run in a fixed order and the first hit wins: Chinese
    states, foreign countries, Chinese cities, Chinese universities,
    foreign states/cities, zip code, and finally the text found inside
    parentheses, which is parsed recursively.

    Args:
        address: Address text. A falsy value yields ``(None, None)``.

    Returns:
        ``(self.MAINLAND, state)`` for a recognised mainland location,
        ``(country, None)`` for a recognised foreign one (foreign states
        are not tracked), or whatever ``self.parse_univ`` returns when
        its first element is truthy. If nothing matches, a warning is
        logged and ``(country, None)`` is returned, where ``country`` is
        ``self.MAINLAND`` when the address started with ``self.CN`` or
        matches ``self.CN_ADDR_PATTERN``, and ``None`` otherwise.
    """
    if not address:
        return None, None

    country = None
    address = trans_str(address, "()“”‘’+- ", "()\"\"''+- ")
    # Drop leading digits, brackets, quotes and signs so that the prefix
    # checks below start at the first remaining character.
    stripped_addr = address.lstrip("0123456789() '\"`+-")
    if stripped_addr.startswith(self.CN):
        country = self.MAINLAND
        stripped_addr = stripped_addr[len(self.CN):]

    # Check states in China.
    matched = self.cn_states_re.match(stripped_addr)
    if matched:
        return self.MAINLAND, matched.group(1)

    # Check foreign countries; their states are not tracked.
    matched = self.foreign_re.match(stripped_addr)
    if matched:
        return matched.group(1), None

    # Check cities in China. ``items()`` iterates the same pairs as
    # ``iteritems()`` and also exists on Python 3.
    for state, cities in self.cn_state_city_map.items():
        if any(stripped_addr.startswith(city) for city in cities):
            return self.MAINLAND, state

    # Check universities in China.
    if self.university_re2.search(stripped_addr):
        univ_result = self.parse_univ(stripped_addr)
        if univ_result[0]:
            return univ_result

    # Check foreign states/cities; only the country is reported.
    for foreign_country, cities in self.foreign_state_city_map.items():
        if any(stripped_addr.startswith(city) for city in cities):
            return foreign_country, None

    # Check the zip code. This uses the unstripped address because the
    # pattern is anchored at its start.
    zip_match = self.ZIP_PATTERN.match(address)
    if zip_match:
        zip_prefix = zip_match.group(1)
        try:
            return self.MAINLAND, self.mainland_zip_map[zip_prefix]
        except KeyError:
            # Unknown zip prefix: fall through to the remaining checks.
            pass

    # Check the text inside parentheses.
    paren_match = self.PAREN_PATTERN.search(address)
    if paren_match:
        return self.parse_address(paren_match.group(1))

    if country is None and self.CN_ADDR_PATTERN.search(address):
        country = self.MAINLAND
    # ``warning`` replaces the deprecated ``warn`` alias; lazy %-style
    # arguments avoid formatting errors on non-ASCII text under Python 2.
    logger.warning("unrecognized address: %s", address)
    return country, None