def parse_text(self, doc, **kwargs):
    """Extract the CUSIP, address, issue name and issuer name from filing text.

    Each regex-derived field is searched with a primary pattern keyed on the
    parenthesised captions (``(CUSIP Number)``, ``(Name of Issuer)``,
    ``(Title of Class of Securities)``) and, if that fails, with a fallback
    pattern keyed on inline labels (``CUSIP No.``, ``Name of Issuer:``,
    ``Title of Class of Securities:``).  A warning is logged for every field
    that neither pattern finds.

    Args:
        doc: Text of the filing to search.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the keys ``cusip``, ``address``, ``issue_name`` and
        ``issuer_name``.  ``cusip``, ``issue_name`` and ``issuer_name`` are
        ``None`` when no pattern matched; ``address`` is whatever
        ``parse_address(doc)`` returns.
    """
    flags = re.IGNORECASE | re.MULTILINE

    # CUSIP
    cusip = None
    cusip_pattern = r"\w{4,6}(\s*|-*)\w{2,3}-*\w{0,1}"
    # The outer parentheses make the whole identifier group 1; the group
    # inside ``cusip_pattern`` only captures the optional separator.
    pat = r"(" + cusip_pattern + r")\W+\(CUSIP Number\)"
    match = re.search(pat, doc, flags)
    if match:
        cusip = match.group(1)
    else:
        # Group 1 is the label variant, group 2 the identifier itself.
        pat = r"cusip (no|number|num)\.*:*\W*(\w{6}\W*\w{3})"
        match = re.search(pat, doc, flags)
        if match:
            cusip = match.group(2)
        else:
            logging.warning("No match for cusip")

    # Address
    address = parse_address(doc)

    # Issue
    issue_name = None
    match = re.search(
        r"\(Name\s+of\s+Issuer\)([\w\W]+?)-*\s+\(Title\s+of\s+Class\s+of\s+Securities\)",
        doc,
        flags,
    )
    if not match:
        match = re.search(
            r"Title\s+of\s+Class\s+of\s+Securities:\s+([\w\W]+?)\s+(Item 2\(e\)\.*)?\s+CUSIP\s+Number:",
            doc,
            flags,
        )
    if match:
        # Both issue patterns expose the name as group 1.
        issue_name = match.group(1).strip()
    else:
        logging.warning("No match for issue name")

    # Issuer
    issuer_name = None
    match = re.search(
        r"\(Amendment\s+No\.*:*\s+[\w\W]*?\)\*?\s+(Under\s+the\s+Securities\s+Exchange\s+Act\s+of\s+1934)?([\w\W]+?)-*\(Name\s+of\s+Issuer\)",
        doc,
        flags,
    )
    if match:
        # Group 1 is the optional "Under the Securities Exchange Act" line.
        issuer_name = match.group(2).strip()
        # The capture may still end with a dashed rule separated from the
        # caption by whitespace.  Strip it with a search-based substitution:
        # an anchored ``(.*?)\s*-+$`` match cannot cross newlines, so it would
        # leave the dashes in place for names that span several lines.
        issuer_name = re.sub(r"\s*-+$", "", issuer_name).strip()
    else:
        match = re.search(
            r"Item\s+1\(a\)\.*\s*-*\s*Name\s+of\s+Issuer:([\w\W]+?)Item",
            doc,
            flags,
        )
        if match:
            issuer_name = match.group(1).strip()
        else:
            logging.warning("No match for issuer name in SC13G/A")

    return {
        "cusip": cusip,
        "address": address,
        "issue_name": issue_name,
        "issuer_name": issuer_name,
    }
def parse_text(self, text, **kwargs):
    """Collect the CUSIP, address, issue name and issuer name found in ``text``.

    The individual fields come from ``self.extract_cusip``, ``parse_address``,
    ``self.extract_issue_name`` and ``self.extract_issuer_name``, called in
    that order.  A warning is logged when the issuer name is falsy.

    Args:
        text: Text handed unchanged to each extraction helper.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the keys ``cusip``, ``address``, ``issue_name`` and
        ``issuer_name`` holding the helpers' return values.
    """
    # Dict displays evaluate their values top to bottom, so the helpers run
    # in the order listed.
    parsed = {
        "cusip": self.extract_cusip(text),
        "address": parse_address(text),
        "issue_name": self.extract_issue_name(text),
        "issuer_name": self.extract_issuer_name(text),
    }

    # Only a missing issuer name is reported here.
    if not parsed["issuer_name"]:
        logging.warning("No match for issuer name in SC13G/A")

    return parsed
def parse_text(self, text, **kwargs):
    """Build the parsed-field dictionary for ``text``.

    Calls ``self.extract_cusip``, ``parse_address``,
    ``self.extract_issue_name`` and ``self.extract_issuer_name`` (in that
    order) and logs a warning if the issuer name comes back falsy.

    Args:
        text: Text passed as-is to every extraction helper.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A dict mapping ``cusip``, ``address``, ``issue_name`` and
        ``issuer_name`` to the corresponding helper results.
    """
    cusip = self.extract_cusip(text)
    location = parse_address(text)
    issue = self.extract_issue_name(text)
    issuer = self.extract_issuer_name(text)

    result = dict(
        cusip=cusip,
        address=location,
        issue_name=issue,
        issuer_name=issuer,
    )

    if issuer:
        return result

    # Nothing usable was extracted for the issuer; report it but still
    # return the other fields.
    logging.warning("No match for issuer name in SC13G/A")
    return result
def parse_text(self, doc, **kwargs):
    """Pull the CUSIP, address, issue name and issuer name out of ``doc``.

    For the CUSIP, issue name and issuer name a primary regular expression
    anchored on the parenthesised captions (``(CUSIP Number)``,
    ``(Name of Issuer)``, ``(Title of Class of Securities)``) is tried first,
    then a fallback anchored on inline labels (``CUSIP No.``,
    ``Title of Class of Securities:``, ``Name of Issuer:``).  A warning is
    logged for each field that neither expression finds.

    Args:
        doc: Text of the filing to search.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the keys ``cusip``, ``address``, ``issue_name`` and
        ``issuer_name``.  The three regex-derived values are ``None`` when
        nothing matched; ``address`` is whatever ``parse_address(doc)``
        returns.
    """
    search_flags = re.IGNORECASE | re.MULTILINE

    # CUSIP
    cusip = None
    cusip_pattern = r'\w{4,6}(\s*|-*)\w{2,3}-*\w{0,1}'
    # Wrapping ``cusip_pattern`` in parentheses makes the full identifier
    # group 1; its own inner group only holds the optional separator.
    pat = r'(' + cusip_pattern + r')\W+\(CUSIP Number\)'
    match = re.search(pat, doc, search_flags)
    if match:
        cusip = match.group(1)
    else:
        # Here group 1 is the label spelling and group 2 the identifier.
        pat = r'cusip (no|number|num)\.*:*\W*(\w{6}\W*\w{3})'
        match = re.search(pat, doc, search_flags)
        if match:
            cusip = match.group(2)
        else:
            logging.warning("No match for cusip")

    # Address
    address = parse_address(doc)

    # Issue
    issue_name = None
    pat = r"\(Name\s+of\s+Issuer\)([\w\W]+?)-*\s+\(Title\s+of\s+Class\s+of\s+Securities\)"
    match = re.search(pat, doc, search_flags)
    if match:
        issue_name = match.group(1).strip()
    else:
        pat = r'Title\s+of\s+Class\s+of\s+Securities:\s+([\w\W]+?)\s+(Item 2\(e\)\.*)?\s+CUSIP\s+Number:'
        match = re.search(pat, doc, search_flags)
        if match:
            issue_name = match.group(1).strip()
        else:
            logging.warning("No match for issue name")

    # Issuer
    issuer_name = None
    pat = r"\(Amendment\s+No\.*:*\s+[\w\W]*?\)\*?\s+(Under\s+the\s+Securities\s+Exchange\s+Act\s+of\s+1934)?([\w\W]+?)-*\(Name\s+of\s+Issuer\)"
    match = re.search(pat, doc, search_flags)
    if match:
        # Group 1 is the optional "Under the Securities Exchange Act" line,
        # so the issuer text is group 2.
        issuer_name = match.group(2).strip()
        # Drop a trailing dashed rule left between the name and the caption.
        # ``re.sub`` searches anywhere in the string, so this also works when
        # the name spans several lines; an anchored ``(.*?)\s*-+$`` match
        # would not, because ``.`` stops at the first newline.
        issuer_name = re.sub(r'\s*-+$', '', issuer_name).strip()
    else:
        pat = r'Item\s+1\(a\)\.*\s*-*\s*Name\s+of\s+Issuer:([\w\W]+?)Item'
        match = re.search(pat, doc, search_flags)
        if match:
            issuer_name = match.group(1).strip()
        else:
            logging.warning("No match for issuer name in SC13G/A")

    return {
        'cusip': cusip,
        'address': address,
        'issue_name': issue_name,
        'issuer_name': issuer_name,
    }