def contains_companies(person: str, companies) -> bool:
    """
    Tell whether a candidate person string actually denotes a company.

    Two checks are performed in order:

    1. If ``person`` matches ``COMPANY_TYPES_RE`` it is parsed with
       ``nltk_re.get_companies``; the first annotation whose name is not
       merely its own company type or description makes the result True.
    2. ``person`` is compared with the already known ``companies``: if it is
       a substring of any company name the result is True.

    :param person: candidate person name.
    :param companies: iterable of company annotations (objects exposing a
        ``name`` attribute) to compare the candidate against.
    :return: True if ``person`` looks like a company, otherwise False.
    """
    if COMPANY_TYPES_RE.search(person):
        # noinspection PyTypeChecker
        for ant in nltk_re.get_companies(person):  # type: CompanyAnnotation
            # a "company" consisting only of its type / description is noise
            if ant.name == ant.company_type or ant.name == ant.description:
                continue
            return True

    # Solving this scenario: This Amendment to Employment Agreement ("Amendment")
    # is entered into between Marsh Supermarkets, Inc. (the "Company"),
    # and Don E. Marsh (the "Executive").
    # That is pretty common, even though it screws up this scenario:
    # "This is an agreement between John Smith and John Smith, LLC"
    #
    # Annotations without a name are skipped: `person in None` would raise
    # TypeError instead of answering the question.
    return any(
        ant.name is not None and person in ant.name
        for ant in companies
    )
def contains_companies(person: str, companies) -> bool:
    """
    Tell whether a candidate person string actually denotes a company.

    Two checks are performed in order:

    1. If ``person`` matches ``COMPANY_TYPES_RE`` it is parsed with
       ``nltk_re.get_companies`` (detailed type and abbreviation requested);
       the first result whose name is not merely its own company type or
       description makes the result True.
    2. ``person`` is compared with the already known ``companies``: if it is
       a substring of any company name the result is True.

    :param person: candidate person name.
    :param companies: iterable of ``(company_name, company_type)`` pairs to
        compare the candidate against.
    :return: True if ``person`` looks like a company, otherwise False.
    """
    if COMPANY_TYPES_RE.search(person):
        for result in nltk_re.get_companies(person, detail_type=True, parse_name_abbr=True):
            co_name, co_type, _type_abbr, _type_label, co_desc, _co_abbr = result
            # a "company" consisting only of its type / description is noise
            if co_name == co_type or co_name == co_desc:
                continue
            return True

    # Solving this scenario: This Amendment to Employment Agreement ("Amendment")
    # is entered into between Marsh Supermarkets, Inc. (the "Company"),
    # and Don E. Marsh (the "Executive").
    # That is pretty common, even though it screws up this scenario:
    # "This is an agreement between John Smith and John Smith, LLC"
    #
    # Entries without a name are skipped: `person in None` would raise
    # TypeError instead of answering the question.
    return any(
        co_name is not None and person in co_name
        for co_name, _co_type in companies
    )
def get_company_annotations(
        text: str,
        strict: bool = False,
        use_gnp: bool = False,
        count_unique: bool = False,
        name_upper: bool = False,
) -> Generator[CompanyAnnotation, None, None]:
    """
    Find company names in text, optionally using the stricter article/prefix expression.

    The text is split into sentences, noun phrases are extracted from every
    sentence, and each phrase matching ``COMPANY_TYPES_RE`` is parsed with
    ``nltk_re.get_companies``. Text or sentences that are entirely upper case
    are skipped. The coordinates of every annotation are shifted by the
    sentence start plus the phrase start before the annotation is returned.

    :param text: text to search for companies.
    :param strict: forwarded to ``get_noun_phrases``; only used with ``use_gnp``.
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param count_unique: return only unique companies - case insensitive.
        The first annotation seen for each (lower-cased name, type
        abbreviation) pair is kept, its ``counter`` is incremented for every
        further occurrence, and unique annotations are yielded only after the
        whole text has been processed.
    :param name_upper: return company name in upper case.
    :return: generator of company annotations.
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return

    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
    unique_companies = {}  # type: Dict[Tuple[str, str], CompanyAnnotation]

    # nothing to extract when no company type marker occurs in the text
    if not COMPANY_TYPES_RE.search(text):
        return

    # Iterate through sentences
    for s_start, _s_end, sentence in get_sentence_span_list(text):
        # skip if whole phrase is in uppercase
        if sentence == sentence.upper():
            continue

        if use_gnp:
            phrases = list(get_noun_phrases(
                sentence, strict=strict, valid_punctuation=valid_punctuation))
        else:
            phrases = list(np_extractor.get_np(sentence))
        phrase_spans = PhrasePositionFinder.find_phrase_in_source_text(
            sentence, phrases)

        for phrase, p_start, _p_end in phrase_spans:
            if not COMPANY_TYPES_RE.search(phrase):
                continue

            # annotation coordinates come back relative to the phrase
            offset = s_start + p_start

            # noinspection PyTypeChecker
            for ant in nltk_re.get_companies(
                    phrase, use_sentence_splitter=False):  # type: CompanyAnnotation
                # a "company" consisting only of its type / description is noise
                if ant.name == ant.company_type or ant.name == ant.description:
                    continue

                ant.coords = (ant.coords[0] + offset, ant.coords[1] + offset)

                # the name may be missing (the unique key below allows for
                # that), so only upper-case a name that is actually present
                if name_upper and ant.name:
                    ant.name = ant.name.upper()

                if not count_unique:
                    yield ant
                    continue

                unique_key = (ant.name.lower() if ant.name else None,
                              ant.company_type_abbr)
                existing_result = unique_companies.get(unique_key)
                if existing_result is not None:
                    existing_result.counter += 1
                else:
                    unique_companies[unique_key] = ant

    if count_unique:
        yield from unique_companies.values()
def get_companies(text: str,
                  strict: bool = False,
                  use_gnp: bool = False,
                  detail_type: bool = False,
                  count_unique: bool = False,
                  name_upper: bool = False,
                  parse_name_abbr: bool = False,
                  return_source: bool = False):
    """
    Find company names in text, optionally using the stricter article/prefix expression.

    The text is split into sentences, noun phrases are extracted from every
    sentence, and each phrase matching ``COMPANY_TYPES_RE`` is parsed with
    ``nltk_re.get_companies``. Text or sentences that are entirely upper case
    are skipped.

    Every result is a tuple that starts with ``(name, type)`` and is extended,
    in this order, by:

    * ``(type_abbr, type_label, description)`` if ``detail_type`` is set;
    * ``(name_abbr,)`` if ``parse_name_abbr`` is set;
    * ``(sentence,)`` if ``return_source`` is set and ``count_unique`` is not;
    * ``(count,)`` if ``count_unique`` is set.

    :param text: text to search for companies.
    :param strict: forwarded to ``get_noun_phrases``; only used with ``use_gnp``.
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param detail_type: return detailed type (type, unified type, label) vs type only
    :param count_unique: return only unique companies - case insensitive.
        Results are keyed by (lower-cased name, type abbreviation) and are
        yielded only after the whole text has been processed.
    :param name_upper: return company name in upper case.
    :param parse_name_abbr: return company abbreviated name if exists.
    :param return_source: append the source sentence to every result
        (ignored when ``count_unique`` is set).
    :return: generator of result tuples as described above.
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return

    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]
    unique_companies = {}

    # nothing to extract when no company type marker occurs in the text
    if not COMPANY_TYPES_RE.search(text):
        return

    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # skip if whole phrase is in uppercase
        if sentence == sentence.upper():
            continue

        if use_gnp:
            phrases = get_noun_phrases(
                sentence, strict=strict, valid_punctuation=valid_punctuation)
        else:
            phrases = np_extractor.get_np(sentence)

        for phrase in phrases:
            if not COMPANY_TYPES_RE.search(phrase):
                continue

            for found in nltk_re.get_companies(phrase, detail_type=True, parse_name_abbr=True):
                co_name, co_type, co_type_abbr, co_type_label, co_desc, co_abbr = found

                # a "company" consisting only of its type / description is noise
                if co_name == co_type or co_name == co_desc:
                    continue

                # the name may be missing (the unique key below allows for
                # that), so only upper-case a name that is actually present
                if name_upper and co_name:
                    co_name = co_name.upper()

                result = (co_name, co_type)
                if detail_type:
                    result += (co_type_abbr, co_type_label, co_desc)
                if parse_name_abbr:
                    result += (co_abbr,)

                if not count_unique:
                    if return_source:
                        result += (sentence,)
                    yield result
                    continue

                unique_key = (co_name.lower() if co_name else None, co_type_abbr)
                existing_result = unique_companies.get(unique_key)
                if existing_result is not None:
                    # the occurrence counter is always the last tuple item
                    unique_companies[unique_key] = \
                        existing_result[:-1] + (existing_result[-1] + 1,)
                else:
                    unique_companies[unique_key] = result + (1,)

    if count_unique:
        yield from unique_companies.values()