def _parse_registry_block(self, registry_txt):
    """Build a ``reg.Business`` from the text of one registry entry.

    The name comes from the first line: the span matched by
    ``self.name_pattern_1`` or, failing that, ``self.name_pattern_2``;
    the whole first line when neither matches.  Address, bracket and SIC
    data are then searched for in the entry with its newlines removed.

    Args:
        registry_txt: Newline-separated text of the entry.

    Returns:
        The populated ``reg.Business``; its city is ``self.current_city``.
    """
    entry = reg.Business()
    first_line = registry_txt.split('\n')[0]
    flat_txt = registry_txt.replace('\n', '')

    # Try the primary name pattern first, then the fallback one.
    found_name = (re.match(self.name_pattern_1, first_line)
                  or re.match(self.name_pattern_2, first_line))
    if not found_name:
        # Neither pattern matched: the whole first line is the name.
        entry.name = first_line
    else:
        entry.name = found_name.group(0)
        # Remove every occurrence of the name from the text searched below.
        flat_txt = re.sub(re.escape(entry.name), '', flat_txt)

    # Group 1 of the address pattern is the address, group 2 the bracket.
    located = re.search(self.address_pattern, flat_txt)
    if located:
        entry.address, entry.bracket = located.group(1, 2)
        flat_txt = re.sub(re.escape(located.group(0)), '', flat_txt)

    # Each SIC hit is a (description, number) pair.
    for description, code in self.sic_pattern.findall(flat_txt):
        entry.category.append(code)
        entry.cat_desc.append(description)

    entry.city = self.current_city
    return entry
def _parse_registry_block(self, registry_txt):
    """Parse one registry entry (works for registries from 1975 onward).

    The first line is taken as the business name and the second line as
    its address.  A city candidate is located with ``self.city_pattern``
    and confirmed through ``self._city_detector``; the employee count is
    the first run of digits inside the ``self.emp_pattern`` match.

    Args:
        registry_txt: Newline-separated text of the entry.

    Returns:
        A ``reg.Business``.  Fields whose source text is missing or does
        not match keep whatever ``reg.Business()`` initialised them to.
    """
    business = reg.Business()
    lines = registry_txt.split("\n")
    business.name = lines[0]
    # A single-line block has no address line; leave the address untouched
    # instead of raising IndexError.
    if len(lines) > 1:
        business.address = lines[1]

    city_match = self.city_pattern.search(registry_txt)
    if city_match:
        city = city_match.group(0)
        # Spell-check the candidate and confirm that it is a city.
        matched_city = self._city_detector.match_to_cities(city)
        if matched_city:
            if matched_city != city:
                print("Imperfect city match: %s matched to %s" % (city, matched_city))
            business.city = matched_city

    emp_match = self.emp_pattern.search(registry_txt)
    if emp_match:
        # Keep only the numeric part of the employment text.
        digits = re.search(r"\d+", emp_match.group(0))
        if digits:
            business.emp = digits.group(0)
    return business
def _process_contour(self, contour_txt, contour_font_attrs, header_str):
    """Handle one contour: either a registry entry or a city header.

    Text containing a newline is parsed as a registry entry, given
    ``header_str`` as its category and the current city/zip (when those
    are non-empty), passed to ``geo.geocode_business`` and returned.
    Any other text is tried as a city header that may end in a 5-digit
    zip; a recognised city replaces ``self.current_city`` and
    ``self.current_zip``, and an empty ``reg.Business`` is returned.

    Args:
        contour_txt: Text extracted from the contour.
        contour_font_attrs: Not used by this method.
        header_str: Value stored as the category of a parsed business.

    Returns:
        The parsed business, or a fresh ``reg.Business`` for headers and
        unrecognised single-line text.
    """
    if "\n" not in contour_txt:
        # Single line: possibly a city header of the form "<city> <zip>".
        head, _, tail = contour_txt.rpartition(" ")
        header_zip = ""
        if tail.isdigit() and len(tail) == 5:
            header_zip = tail
            contour_txt = head
        detected_city = self._city_detector.match_to_cities(contour_txt)
        if detected_city:
            self.current_city = detected_city
            self.current_zip = header_zip
        return reg.Business()

    # Two or more lines: treat the contour as a registry entry.
    record = self._parse_registry_block(contour_txt)
    record.category = header_str
    if len(self.current_city):
        record.city = self.current_city
    if len(self.current_zip):
        record.zip = self.current_zip
    geo.geocode_business(record)
    return record
def _parse_registry_block(self, registry_txt):
    """Parse one registry entry into a ``reg.Business``.

    The first line is the business name.  Address text is gathered from
    the lines that contain a run of two or more digits, stopping at the
    first such line that also matches ``self.phone_pattern``.  When that
    text matches ``self.paren_pattern`` the address is its group 1 and
    city/zip come from ``self.bad_address_pattern``; otherwise address,
    city and zip all come from ``self.good_address_pattern``.

    SIC data: when ``self.sic_pattern`` finds a list, ``category`` and
    ``cat_desc`` are lists extracted from its first hit; otherwise a single
    ``NNNN: description`` entry is looked for and both fields are plain
    strings.  Employment and sales are group 1 of their patterns.

    Args:
        registry_txt: Newline-separated text of the entry.

    Returns:
        The populated ``reg.Business``; fields without a match keep the
        values ``reg.Business()`` gave them.
    """
    business = reg.Business()
    lines = registry_txt.split('\n')
    business.name = lines[0]

    # Collect the address lines (each prefixed by a space, as before).
    address_parts = []
    for line in lines:
        if not re.search(r'[0-9]{2,}', line):
            continue
        if self.phone_pattern.search(line):
            break
        address_parts.append(' ' + line)
    full_address = ''.join(address_parts)

    paren_match = self.paren_pattern.search(full_address)
    if paren_match:
        business.address = paren_match.group(1)
        bad_match = self.bad_address_pattern.search(full_address)
        if bad_match:
            business.city = bad_match.group(1)
            business.zip = bad_match.group(2)
    else:
        good_match = self.good_address_pattern.search(full_address)
        if good_match:
            business.address = good_match.group(1)
            business.city = good_match.group(2)
            business.zip = good_match.group(3)

    sic_lists = self.sic_pattern.findall(registry_txt)
    if sic_lists:
        business.category = re.findall(r'\d{4}', sic_lists[0])
        business.cat_desc = re.findall(r'[^\:0-9\n]+[\n]*[^0-9\:]*', sic_lists[0])
    else:
        # Fixed: the pattern previously read '(/d{4}):[/s]+(.*)', using
        # forward slashes instead of the \d and \s escapes, so it could
        # never match a real SIC code.  The duplicated retry of the same
        # search was dead code and has been dropped.
        single_sic = re.search(r'(\d{4}):\s+(.*)', registry_txt, re.DOTALL)
        if single_sic:
            business.category = single_sic.group(1)
            business.cat_desc = single_sic.group(2)

    emp_match = self.emp_pattern.search(registry_txt)
    if emp_match:
        business.emp = emp_match.group(1)

    sales_match = self.sales_pattern.search(registry_txt)
    if sales_match:
        business.sales = sales_match.group(1)

    return business
def _process_contour(self, contour_txt, contour_font_attrs):
    """Dispatch one contour to registry parsing or city tracking.

    Args:
        contour_txt: Text extracted from the contour.
        contour_font_attrs: Not used by this method.

    Returns:
        The parsed business when ``self.registry_pattern`` is found in the
        text; otherwise an empty ``reg.Business``.  In the latter case a
        ``self.city_pattern`` hit first stores its group 1 in
        ``self.current_city``.
    """
    if self.registry_pattern.search(contour_txt):
        return self._parse_registry_block(contour_txt)

    # Not a registry entry: it may be a city heading.
    city_hit = self.city_pattern.search(contour_txt)
    if city_hit:
        self.current_city = city_hit.group(1)
    return reg.Business()
def _process_contour(self, contour_txt, contour_font_attrs):
    """Append a tagged copy of a registry contour to ``self.registry_txt``.

    A contour qualifies when its start matches ``self.registry_pattern``
    but not ``self.sic_pattern`` and it has at least two lines.  Its first
    line is wrapped with the name prefix, its second with the address
    prefix, and the space-joined lines are written between the business
    prefix markers, each on its own line.

    Args:
        contour_txt: Text extracted from the contour.
        contour_font_attrs: Not used by this method.

    Returns:
        Always a fresh, empty ``reg.Business``.
    """
    def tagged(prefix, text):
        # Surround ``text`` with whatever self._start/_end produce for
        # ``prefix`` (presumably opening/closing markers -- confirm).
        return self._start(prefix) + " " + text + " " + self._end(prefix)

    looks_like_registry = self.registry_pattern.match(contour_txt)
    looks_like_sic = self.sic_pattern.match(contour_txt)
    if not looks_like_registry or looks_like_sic:
        return reg.Business()

    rows = contour_txt.split("\n")
    if len(rows) < 2:
        # A name line and an address line are both required.
        return reg.Business()

    rows[0] = tagged(self.name_prefix, rows[0])
    rows[1] = tagged(self.address_prefix, rows[1])

    self.registry_txt += "\n" + self._start(self.bus_prefix) + "\n"
    self.registry_txt += " ".join(rows)
    self.registry_txt += "\n" + self._end(self.bus_prefix) + "\n"
    return reg.Business()
def _process_contour(self, contour_txt, contour_font_attrs):
    """Dispatch one contour to registry parsing or SIC-header tracking.

    Args:
        contour_txt: Text extracted from the contour.
        contour_font_attrs: Not used by this method.

    Returns:
        When the text starts with a ``self.registry_pattern`` match: the
        parsed business, with ``self.current_sic`` as its category, after
        it has been passed to ``geo.geocode_business``.  Otherwise an empty
        ``reg.Business``; a ``self.sic_pattern`` match at the start of the
        text is first remembered in ``self.current_sic``.
    """
    if self.registry_pattern.match(contour_txt):
        record = self._parse_registry_block(contour_txt)
        record.category = self.current_sic
        geo.geocode_business(record)
        return record

    # Not a registry entry: it may be a SIC heading.
    sic_hit = self.sic_pattern.match(contour_txt)
    if sic_hit:
        self.current_sic = sic_hit.group(0)
    return reg.Business()
def _parse_registry_block(self, registry_txt):
    """Parse one registry entry (works for registries from 2005).

    The first line is the business name.  Address text is the
    concatenation of the lines containing a digit, up to the first such
    line that also contains ``Phone``.  Category-description text is the
    concatenation of all lines before the first one containing
    ``Employs``.  The remaining fields are group 1 of their patterns
    searched over the whole entry.

    Args:
        registry_txt: Newline-separated text of the entry.

    Returns:
        The populated ``reg.Business``; fields without a match keep the
        values ``reg.Business()`` gave them.
    """
    record = reg.Business()
    rows = registry_txt.split('\n')
    record.name = rows[0]

    # Address rows: digit-bearing rows before the phone row.
    address_rows = []
    for row in rows:
        if re.search(r'[0-9]+', row) is None:
            continue
        if 'Phone' in row:
            break
        address_rows.append(row)
    full_address = "".join(address_rows)

    # Description rows: everything before the employment row.
    desc_rows = []
    for row in rows:
        if 'Employs' in row:
            break
        desc_rows.append(row)
    cat_desc_txt = "".join(desc_rows)

    address_hit = self.address_pattern.search(full_address)
    if address_hit:
        record.address, record.zip = address_hit.group(1, 2)

    desc_hit = self.cat_desc_pattern.search(cat_desc_txt)
    if desc_hit:
        record.cat_desc = desc_hit.group(1)

    # Fields taken straight from group 1 of a whole-entry search.
    whole_entry_fields = (
        ("category", self.sic_pattern),
        ("emp", self.emp_pattern),
        ("sales", self.sales_pattern),
    )
    for field_name, pattern in whole_entry_fields:
        hit = pattern.search(registry_txt)
        if hit:
            setattr(record, field_name, hit.group(1))

    return record
def _parse_registry_block(self, registry_txt):
    """Build a ``reg.Business`` from the text of one registry entry.

    The first line is the business name.  Text following a colon and one
    whitespace character (the manager/president/administrator names) is
    removed wherever it occurs before the address and zip are searched
    for.  Newlines are then dropped and the SIC pairs and bracket are
    extracted from the flattened text.

    Args:
        registry_txt: Newline-separated text of the entry.

    Returns:
        The populated ``reg.Business``; its city is ``self.current_city``.
    """
    entry = reg.Business()
    entry.name = registry_txt.split('\n')[0]

    # Strip the names listed after "<title>: " from the whole entry.
    for person in re.findall(r':\s([A-Za-z \t\r\f\v]+)', registry_txt):
        registry_txt = registry_txt.replace(person, '')

    address_hit = re.search(self.address_pattern, registry_txt)
    if address_hit:
        entry.address = address_hit.group(1)

    zip_hit = re.search(self.zip_pattern, registry_txt)
    if zip_hit:
        entry.zip = zip_hit.group(1)

    # The remaining patterns are applied to the entry without newlines.
    flat_txt = registry_txt.replace('\n', '')

    # Each SIC hit is a (description, number) pair.
    for description, code in self.sic_pattern.findall(flat_txt):
        entry.category.append(code)
        entry.cat_desc.append(description)

    bracket_hit = re.search(self.bracket_pattern, flat_txt)
    if bracket_hit:
        entry.bracket = bracket_hit.group(1)

    entry.city = self.current_city
    return entry
def _parse_registry_block(self, registry_txt):
    """Parse one registry entry (works for registries from 1953-1975).

    The first line is the business name and the second line the address.
    When ``self.zip_pattern`` matches the address line, its ``zip`` group
    becomes the zip and its ``address`` group replaces the address text.
    The employment value is the last character of the
    ``self.emp_pattern`` match.

    Args:
        registry_txt: Newline-separated text of the entry.

    Returns:
        A ``reg.Business``.  Fields whose source text is missing or does
        not match keep whatever ``reg.Business()`` initialised them to.
    """
    business = reg.Business()
    lines = registry_txt.split("\n")
    business.name = lines[0]

    # A single-line block has no address line; skip address/zip parsing
    # instead of raising IndexError.
    if len(lines) > 1:
        address_line = lines[1]
        zip_match = self.zip_pattern.search(address_line)
        if zip_match:
            business.zip = zip_match.group("zip")
            address_line = zip_match.group("address")
        business.address = address_line

    emp_match = self.emp_pattern.search(registry_txt)
    if emp_match:
        # Only the final character of the matched text is kept.
        business.emp = emp_match.group(0)[-1]
    return business