Example #1
0
def test_trans_str():
    """Check that trans_str substitutes characters one-for-one."""
    cases = [
        # (text, characters to replace, replacements, expected result)
        ("this is a test", "abcde", "ABCDE", "this is A tEst"),
        ("中海油(天津)管道工程技术有限公司", "()", "()",
         "中海油(天津)管道工程技术有限公司"),
        ("他说:“‘好’极了!”", "“”‘’:!", "\"\"'':!",
         "他说:\"'好'极了!\""),
    ]
    for text, src_chars, dst_chars, expected in cases:
        assert trans_str(text, src_chars, dst_chars) == expected
Example #2
0
def test_trans_str():
    """Check that trans_str substitutes characters one-for-one."""
    cases = [
        # (text, characters to replace, replacements, expected result)
        ("this is a test", "abcde", "ABCDE", "this is A tEst"),
        ("中海油(天津)管道工程技术有限公司", "()", "()",
         "中海油(天津)管道工程技术有限公司"),
        ("他说:“‘好’极了!”", "“”‘’:!", "\"\"'':!",
         "他说:\"'好'极了!\""),
    ]
    for text, src_chars, dst_chars, expected in cases:
        assert trans_str(text, src_chars, dst_chars) == expected
Example #3
0
    def parse_applicants(self, applicants, address=None, include_org=False):
        """Classify every applicant in a semicolon-separated string.

        Each applicant gets a kind (from ``parse_applicant``) and a state.
        The state comes, in order of preference, from the foreign-industry
        pattern, the applicant text parsed as an address, a university
        lookup, and finally the main ``address``.  Applicants with no kind
        or no state are dropped.

        Returns a list of ``(kind, state, applicant, main_org)`` tuples when
        ``include_org`` is true.  Otherwise returns a ``(pairs, main_country,
        main_state)`` triple where ``pairs`` holds ``(kind, state)`` tuples;
        with more than one pair they are de-duplicated and sorted, and a
        single distinct pair is listed twice.
        """
        main_country, main_state = self.parse_address(address)
        parsed = []
        for applicant in re.split(";|;", applicants):
            kind = self.parse_applicant(applicant)
            if kind is None:
                continue

            if self.foreign_industry_re.search(applicant):
                country = None
                state = self.FOREIGN
            else:
                country, state = self.parse_address(applicant)

            if state is None and country is not None:
                # a known country other than mainland China means foreign
                if country != self.MAINLAND:
                    state = self.FOREIGN
            elif state is None and kind == self.UNIVERSITY:
                _univ_country, state = self.parse_univ(applicant)
                if state is None and main_country != self.MAINLAND:
                    # assume an unknown university outside mainland China
                    # is foreign
                    state = self.FOREIGN

            # last resort: derive the state from the main address
            if state is None and main_country:
                if main_country == self.MAINLAND:
                    state = main_state
                else:
                    state = self.FOREIGN

            if not state:
                continue
            if not include_org:
                parsed.append((kind, state))
                continue

            applicant = applicant.decode('utf8').strip(u"   ").encode('utf8')
            applicant = trans_str(applicant, "()  ", "()  ")
            parsed.append((kind, state, applicant, self.main_org(applicant)))

        if include_org:
            return parsed

        if len(parsed) > 1:
            # drop duplicates, but keep two entries when only one distinct
            # pair remains
            distinct = list(set(parsed))
            if len(distinct) == 1:
                parsed = distinct * 2
            else:
                parsed = sorted(distinct)

        return parsed, main_country, main_state
Example #4
0
    def parse_applicants(self, applicants, address=None, include_org=False):
        """Classify every applicant in a semicolon-separated string.

        Each applicant gets a kind (from ``parse_applicant``) and a state.
        The state comes, in order of preference, from the foreign-industry
        pattern, the applicant text parsed as an address, a university
        lookup, and finally the main ``address``.  Applicants with no kind
        or no state are dropped.

        Returns a list of ``(kind, state, applicant, main_org)`` tuples when
        ``include_org`` is true.  Otherwise returns a ``(pairs, main_country,
        main_state)`` triple where ``pairs`` holds ``(kind, state)`` tuples;
        with more than one pair they are de-duplicated and sorted, and a
        single distinct pair is listed twice.
        """
        main_country, main_state = self.parse_address(address)
        parsed = []
        for applicant in re.split(";|;", applicants):
            kind = self.parse_applicant(applicant)
            if kind is None:
                continue

            if self.foreign_industry_re.search(applicant):
                country = None
                state = self.FOREIGN
            else:
                country, state = self.parse_address(applicant)

            if state is None and country is not None:
                # a known country other than mainland China means foreign
                if country != self.MAINLAND:
                    state = self.FOREIGN
            elif state is None and kind == self.UNIVERSITY:
                _univ_country, state = self.parse_univ(applicant)
                if state is None and main_country != self.MAINLAND:
                    # assume an unknown university outside mainland China
                    # is foreign
                    state = self.FOREIGN

            # last resort: derive the state from the main address
            if state is None and main_country:
                if main_country == self.MAINLAND:
                    state = main_state
                else:
                    state = self.FOREIGN

            if not state:
                continue
            if not include_org:
                parsed.append((kind, state))
                continue

            applicant = applicant.decode('utf8').strip(u"   ").encode('utf8')
            applicant = trans_str(applicant, "()  ", "()  ")
            parsed.append((kind, state, applicant, self.main_org(applicant)))

        if include_org:
            return parsed

        if len(parsed) > 1:
            # drop duplicates, but keep two entries when only one distinct
            # pair remains
            distinct = list(set(parsed))
            if len(distinct) == 1:
                parsed = distinct * 2
            else:
                parsed = sorted(distinct)

        return parsed, main_country, main_state
Example #5
0
    def parse_address(self, address):
        """Parse an address and return a ``(country, state)`` pair.

        The checks run in a fixed order and the first hit wins: Chinese
        state prefix, foreign country prefix, Chinese city prefix,
        university lookup, foreign state/city prefix, zip code, and finally
        the text inside parentheses (parsed recursively).

        :param address: raw address string; ``None`` or empty is accepted.
        :returns: ``(country, state)``.  Foreign matches carry ``None`` as
            the state.  When nothing matches, ``state`` is ``None`` and
            ``country`` is ``self.MAINLAND`` if the address starts with
            ``self.CN`` or matches ``self.CN_ADDR_PATTERN``, else ``None``.
        """
        if not address:
            return None, None

        country = None
        address = trans_str(address, "()“”‘’+-  ", "()\"\"''+-  ")
        # drop leading digits, brackets, quotes and signs before matching
        stripped_addr = address.lstrip("0123456789() '\"`+-")
        if stripped_addr.startswith(self.CN):
            country = self.MAINLAND
            stripped_addr = stripped_addr[len(self.CN):]

        # check states in China...
        matched = self.cn_states_re.match(stripped_addr)
        if matched:
            return self.MAINLAND, matched.group(1)

        # check foreign countries...
        matched = self.foreign_re.match(stripped_addr)
        if matched:
            # a foreign country's state is not tracked
            return matched.group(1), None

        # check cities in China...
        for state, cities in self.cn_state_city_map.iteritems():
            if any(stripped_addr.startswith(city) for city in cities):
                return self.MAINLAND, state

        # check universities in China...
        if self.university_re2.search(stripped_addr):
            univ_result = self.parse_univ(stripped_addr)
            if univ_result[0]:
                return univ_result

        # check foreign states/cities...
        for foreign_country, cities in self.foreign_state_city_map.iteritems():
            if any(stripped_addr.startswith(city) for city in cities):
                # a foreign state/city only identifies the country
                return foreign_country, None

        # check zip code
        zip_match = self.ZIP_PATTERN.match(address)
        if zip_match:
            zip_prefix = zip_match.group(1)
            try:
                return self.MAINLAND, self.mainland_zip_map[zip_prefix]
            except KeyError:
                # unknown zip prefix: fall through to the remaining checks
                pass

        # check string in parentheses
        paren_match = self.PAREN_PATTERN.search(address)
        if paren_match:
            inner = self.parse_address(paren_match.group(1))
            # Only trust the parenthesised text when it was recognised;
            # otherwise keep the country already detected above and still
            # try the generic Chinese-address pattern below.
            if inner[0] is not None or inner[1] is not None:
                return inner

        if country is None and self.CN_ADDR_PATTERN.search(address):
            country = self.MAINLAND

        # Logger.warn is a deprecated alias; pass the address as a lazy
        # argument so it is only formatted when the record is emitted.
        logger.warning("unrecognized address: %s", address)
        return country, None