def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Parse the row's raw license string into credentials, degrees and
    modalities, and attach the matching records to ``provider``.

    Degrees/credentials without a mapped record are only warned about;
    modalities are get-or-created in the modality table.
    """
    debug_label = None
    if self._debug:
        # Human-readable label ("<padded name> <license>") passed to the
        # parser for debug output only.
        fname = m(row, 'first_name', str)
        lname = m(row, 'last_name', str)
        license_text = m(row, 'license', str)
        debug_label = "{:<30} {}".format(fname + " " + lname, license_text)
    parsed = CredentialParser(row['license'], debug_label)
    if self._debug:
        self._credentials.append(parsed)
    # Attach degrees we have records for; warn about the rest.
    for degree in parsed.valid_degrees:
        if degree not in self._degree_map:
            print('WARNING: No degree record for', degree)
            continue
        provider.degrees.append(self._degree_map[degree])
    # Same deal for credentials.
    for cred in parsed.valid_credentials:
        if cred not in self._credential_map:
            print('WARNING: no cred record for', cred)
            continue
        provider.credentials.append(self._credential_map[cred])
    # Modalities: look up by name, creating a row when missing.
    for modality_name in parsed.modalities:
        record: Modality = self._session.query(Modality).filter_by(
            name=modality_name).options(load_only('id')).one_or_none()
        if not record:
            record = Modality(name=modality_name)
            self._session.add(record)
        provider.modalities.append(record)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Split the semicolon-delimited license and certificate number
    fields and hand each non-empty entry to its handler."""
    field_handlers = (
        ('license_number', self._do_license),
        ('certificate_number', self._do_cert),
    )
    for field, handler in field_handlers:
        for token in m(row, field, str, "").split(";"):
            if token:
                handler(token, provider)
def merge(self, row: OrderedDict) -> None:
    """Fold one raw provider row into this in-memory canonical record.

    Accumulates ids, ZIPs, certificate and NYSOP license numbers,
    directory ids and name variants, then re-parses credentials and
    stores the raw row.

    Raises AssertionError on a missing last name or row id, and
    Exception when the last name conflicts irreconcilably or the row
    has neither a directory nor a payor id.
    """
    ln: str = m(row, 'last_name', str, "")
    assert ln, "no last name"
    fn: str = m(row, 'first_name', str, "")
    # Normalize both names: drop periods and all whitespace, lowercase.
    fn = "".join(fn.replace(".", "").split()).lower()
    ln = "".join(ln.replace(".", "").split()).lower()
    if self.last_name:
        if ln != self.last_name:
            # A differing last name is only tolerated when this exact
            # first+last combination was already recorded by an earlier
            # merge (e.g. a known name variant).
            if not fn:
                raise Exception("No fn, differing last name")
            prospective_full_name = fn + ln
            if prospective_full_name not in self.full_names:
                raise Exception("differing last name merge")
    else:
        self.last_name = ln
    row_id: int = m(row, 'id', int)
    assert row_id, "no row id"
    self.ids.add(row_id)
    # Addresses are only used to harvest ZIP codes via the Redis
    # address->zip hash; addresses with no cached ZIP are skipped.
    raw_addresses: str = m(row, 'address', str)
    if raw_addresses:
        for address in PhoneAddyMunger.parse_raw_address(raw_addresses):
            z = REDIS.hget(ZIP_HASH, address)
            if z:
                self.zips.add(z.decode())
    cert_number = m(row, 'certificate_number', str)
    if cert_number:
        self.certificates.add(cert_number)
    license_number = m(row, 'license_number', str)
    if license_number:
        # Only NYSOP-style license numbers are tracked, keyed by the
        # cleaned number, mapping to the set of codes seen for it.
        clean, code, is_nysop = LicenseCertMunger.clean_up_nysop_number(
            license_number)
        if is_nysop:
            if clean in self.licenses:
                self.licenses[clean].add(code)
            else:
                self.licenses[clean] = {code}
    # Prefer directory_id, fall back to payor_id; one must be present.
    directory_id: str = m(row, 'directory_id', str, None)
    if not directory_id:
        directory_id = m(row, 'payor_id', str, None)
        if not directory_id:
            raise Exception("XXX", "no directory or payor")
    self.directories.add(directory_id)
    if fn:
        if not self.first_name:
            self.first_name = fn
        self.first_initials.add(fn[:self.INITIAL_COUNT])
        # NOTE: uses the canonical self.last_name, not this row's ln.
        self.full_names.add(fn + self.last_name)
    # Parse credentials
    self.credentials = CredentialParser(row['license'], str(row_id))
    self.rows.append(row)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Normalize the free-text 'accepted_payors' field into ';'-separated
    comment tokens and get-or-create an AcceptedPayorComment for each
    distinct one, relating any that are not already on the provider."""
    raw: str = m(row, 'accepted_payors', str)
    if not raw:
        return
    # NOTE: replacement order matters — the "out of network" variants
    # must collapse to "oon" before "oon" is turned into its own token.
    replaced_raw = raw.strip().lower() \
        .replace(":", ' ') \
        .replace(")", ' ') \
        .replace("(", ' ') \
        .replace('"', ' ') \
        .replace("/", ';') \
        .replace("out-of-network", 'oon') \
        .replace("out of network", 'oon') \
        .replace("oon -", 'oon ') \
        .replace("oon-", 'oon ') \
        .replace("oon", ";oon;") \
        .replace(".", ';') \
        .replace(",", ';') \
        .replace("=", ' ') \
        .replace("|", ';') \
        .replace("*", ' ') \
        .replace("&", ' and ') \
        .replace("+", ' ')
    replaced_raw = OrientationMunger.MULTI_WHITESPACE_STRIP \
        .sub(' ', replaced_raw)
    if not replaced_raw:
        return
    seen = set()
    batch = []
    for piece in replaced_raw.split(';'):
        body = piece.strip()
        if not body or body in seen:
            continue
        seen.add(body)
        apc: AcceptedPayorComment = self._session.query(
            AcceptedPayorComment).filter_by(body=body).options(
                load_only('id')).one_or_none()
        if apc is None:
            apc = AcceptedPayorComment(body=body)
            self._session.add(apc)
        batch.append(apc)
    existing = set(provider.accepted_payor_comments)
    for record in batch:
        if record not in existing:
            provider.accepted_payor_comments.append(record)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Relate the accepted insurance plans listed in the ';'-delimited
    'accepted_plan_ids' field, resolving each id via self._id_map.

    Fix: empty segments (trailing or doubled semicolons, whitespace)
    are skipped — previously they crashed on int('').  Mirrors the
    guard used by the license/certificate-number plugin.
    """
    apm = m(row, 'accepted_plan_ids', str)
    if not apm:
        return
    for raw_id in apm.split(';'):
        plan_id = raw_id.strip()
        if not plan_id:
            continue
        provider.plans_accepted.append(self._id_map[int(plan_id)])
def update_website(self, rows) -> None:
    """Backfill monday.provider.website_url for canonical providers
    that do not yet have one, from raw rows that do.

    Row ids are mapped to canonical provider ids via the Redis
    ROW_ID_HASH.  Commits in batches of 250 and once more at the end.
    """
    query: text = text("""
        SELECT website_url FROM monday.provider WHERE id = :id
    """)
    update_query: text = text("""
        UPDATE monday.provider SET website_url = :val WHERE id = :id
    """)
    params = {'val': "", 'id': 0}
    updated = 0
    i = 0
    bar = progressbar.ProgressBar(max_value=len(rows), initial_value=i)
    for row in rows:
        url: str = m(row, 'website_url', str)
        if not url:
            i += 1
            bar.update(i)
            continue
        row_id = m(row, 'id', int)
        assert row_id, "there must be a row id"
        # Map the raw row id to its canonical provider id.
        canonical_id: int = int(self._r.hget(ROW_ID_HASH, row_id))
        for rrow in self._session.execute(query, {"id": canonical_id}):
            # Only fill in providers that have no URL yet.
            if not rrow['website_url']:
                params['val'] = url
                params['id'] = canonical_id
                self._session.execute(update_query, params)
                updated += 1
                # BUG FIX: was `if updated % 250:`, which committed on
                # every update EXCEPT each 250th — the opposite of the
                # intended batch commit.
                if updated % 250 == 0:
                    self._session.commit()
        i += 1
        bar.update(i)
    self._session.commit()
    print()
    print("Updated", updated, "rows.")
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Parse the free-text 'languages' field and relate the detected
    language records to the provider (English is always assumed)."""
    raw: str = m(row, 'languages', str)
    if not raw:
        return
    # Everyone gets English by default.
    found = {self._records['english']}
    replaced_raw = raw.lower() \
        .replace(" and ", ";") \
        .replace("bilingual", "") \
        .replace("proficient", "") \
        .replace("conversational", "") \
        .replace("native", "") \
        .replace("speaker", "") \
        .replace("(", "") \
        .replace(")", "") \
        .replace("&", ";") \
        .replace(":", ";") \
        .replace("/", ";") \
        
    for token in replaced_raw.split(';'):
        name = token.strip()
        # Skip empty and single-character junk tokens.
        if len(name) < 2:
            continue
        result = self._process_token(name)
        if result is True:
            # Known-bad token; nothing to record.
            continue
        if result is False and '-' in name:
            # One more strategy: retry with hyphens removed.
            name = name.replace('-', '').strip()
            result = self._process_token(name)
            if result is True:
                continue
        if result is False:
            # Still unrecognized — record the miss.
            self._missed(name, raw)
        else:
            found.add(result)
    if not found:
        return
    existing = set(provider.languages)
    for lang in found:
        if lang not in existing:
            provider.languages.append(lang)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Parse the raw address and phone strings into records and attach
    them to the provider, tagged with the row's directory id."""
    raw_address = m(row, 'address', str)
    raw_phone = m(row, 'phone', str)
    # A little hack: normalize a falsy directory id (0 / missing) to None.
    directory_id: Union[int, None] = m(row, 'directory_id', int, None) or None
    if raw_address:
        # Break the address blob apart and attach each piece.
        for address in self._cleanup_addresses(raw_address):
            address.directory_id = directory_id
            provider.addresses.append(address)
    if raw_phone:
        # Same for phone numbers.
        for number in self.cleanup_phone_numbers(raw_phone):
            number.directory_id = directory_id
            provider.phone_numbers.append(number)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Parse 'works_with_ages' into named age groups and numeric age
    ranges and assign both to the provider.

    Tokens are ';'-separated; a parenthesized "(A to B)" or "(A+)"
    suffix becomes a NumericRange (memoized per suffix text in
    self._range_cache).  Tokens with a paren suffix that parses neither
    way are collected in self._missed.
    """
    raw: str = m(row, 'works_with_ages', str)
    if not raw:
        return
    found_age_groups = set()
    ranges: Set[NumericRange] = set()
    for token in raw.lower().split(';'):
        token = token.strip()
        if not token:
            continue
        if token in self.AGE_NAMES:
            found_age_groups.add(token)
            # NOTE(review): nesting reconstructed from flattened source —
            # as written, a known age name carrying a paren suffix skips
            # range parsing entirely; confirm against AGE_NAMES contents.
            if token.find("(") > -1:
                continue
        # No parenthesized range present — nothing more to parse.
        # NOTE(review): unknown paren-less tokens are silently dropped
        # here (never recorded in self._missed) — confirm intended.
        if token.find("(") < 0:
            continue
        inside = token[token.find("(") + 1:token.find(")")]
        # range cache hit
        if inside in self._range_cache:
            ranges.add(self._range_cache[inside])
            continue
        sub_tokens = inside.split("to")
        # range cache miss
        if len(sub_tokens) > 1:
            # "(A to B)" form.
            val = NumericRange(int(sub_tokens[0]), int(sub_tokens[1]),
                               bounds=self.RANGE_BOUNDS)
            self._range_cache[inside] = val
            ranges.add(val)
            continue
        sub_token = sub_tokens[0]
        if sub_token[-1:] == "+":
            # "(A+)" form: open-ended upward, capped at 999.
            val = NumericRange(int(sub_token[:-1]), 999,
                               bounds=self.RANGE_BOUNDS)
            self._range_cache[inside] = val
            ranges.add(val)
            continue
        # Missed!
        self._missed.add(token)
    provider.age_groups = list(found_age_groups)
    provider.age_ranges = list(ranges)
def update_began_practice(self, rows) -> None:
    """Backfill monday.provider.began_practice from rows' positive
    years_in_practice values (began = current year - years).

    Row ids are mapped to canonical provider ids via the Redis
    ROW_ID_HASH.  Commits in batches of 250 and once more at the end.
    """
    query: text = text("""
        UPDATE monday.provider SET began_practice = :val WHERE id = :id
    """)
    current_year: int = datetime.datetime.now().year
    params: dict = {'val': None, 'id': None}
    updated = 0
    i = 0
    bar = progressbar.ProgressBar(max_value=len(rows), initial_value=i)
    for row in rows:
        yip: int = m(row, 'years_in_practice', int)
        # Skip rows without a usable (positive) years-in-practice.
        if not yip or yip < 1:
            i += 1
            bar.update(i)
            continue
        row_id = m(row, 'id', int)
        assert row_id, "there must be a row id"
        canonical_id: int = int(self._r.hget(ROW_ID_HASH, row_id))
        params['val'] = current_year - yip
        params['id'] = canonical_id
        self._session.execute(query, params)
        updated += 1
        # BUG FIX: was `if updated % 250:`, which committed on every
        # update EXCEPT each 250th — the opposite of batch committing.
        if updated % 250 == 0:
            self._session.commit()
        i += 1
        bar.update(i)
    self._session.commit()
    print()
    print("Updated", updated, "rows.")
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Relate the payment-method records named in the ';'-delimited
    'accepted_payment_methods' field, resolved via self._id_map."""
    apm = m(row, 'accepted_payment_methods', str)
    if not apm:
        return
    for raw_method in apm.split(';'):
        method = raw_method.strip().lower()
        # BUG FIX: the original tested the pre-strip token
        # (`if not methods:`), so whitespace-only segments slipped
        # through and raised KeyError on self._id_map[''].
        if not method:
            print(method, row)
            continue
        provider.payment_methods.append(self._id_map[method])
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Tokenize the 'works_with_groups' field and get-or-create a Group
    row for each distinct cleaned token, relating any not already on
    the provider."""
    raw: str = m(row, 'works_with_groups', str)
    if not raw:
        return
    replaced_raw = raw.strip().lower() \
        .replace(":", ' ') \
        .replace(")", ' ') \
        .replace("(", ' ') \
        .replace('"', ' ') \
        .replace("'", ' ') \
        .replace("/", ';') \
        .replace("&", ' and ') \
        .replace("-", ' ') \
        .replace(".", ' ') \
        .replace(",", ' ') \
        .replace("+", ' ')
    replaced_raw = OrientationMunger.MULTI_WHITESPACE_STRIP \
        .sub(' ', replaced_raw)
    if not replaced_raw:
        return
    seen = set()
    batch = []
    for piece in replaced_raw.split(';'):
        body = piece.strip()
        if not body or body in seen:
            continue
        seen.add(body)
        # Look the group up by body, creating it on demand.
        group: Group = self._session.query(Group).filter_by(
            body=body).options(load_only('id')).one_or_none()
        if group is None:
            group = Group(body=body)
            self._session.add(group)
        batch.append(group)
    linked = set(provider.groups)
    for group in batch:
        if group not in linked:
            provider.groups.append(group)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Tokenize the 'treatment_orientations' field and relate an
    Orientation row (created on demand) for each distinct token."""
    raw: str = m(row, 'treatment_orientations', str)
    if not raw:
        return
    replaced_raw = raw.strip().lower() \
        .replace(":", ' ') \
        .replace(")", ' ') \
        .replace("(", ' ') \
        .replace("/", ' ') \
        .replace("&", ' and ') \
        .replace("-", ' ') \
        .replace(".", ' ') \
        .replace(",", ' ') \
        .replace("+", ' ')
    replaced_raw = self.MULTI_WHITESPACE_STRIP.sub(' ', replaced_raw)
    seen = set()
    batch = []
    for chunk in replaced_raw.split(';'):
        body = chunk.strip()
        if not body or body in seen:
            continue
        seen.add(body)
        # Get-or-create the orientation record for this token.
        orientation: Orientation = self._session.query(
            Orientation).filter_by(body=body).options(
                load_only('id')).one_or_none()
        if orientation is None:
            orientation = Orientation(body=body)
            self._session.add(orientation)
        batch.append(orientation)
    linked = set(provider.treatment_orientations)
    for orientation in batch:
        if orientation not in linked:
            provider.treatment_orientations.append(orientation)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Tokenize the 'modalities' field (plus GT's 'services' field for
    directory 3) and get-or-create a Modality row for each distinct
    token, relating any not already on the provider.

    The replace chain both strips punctuation and injects ';' token
    boundaries around well-known modality words; its order matters.
    """
    raw: str = m(row, 'modalities', str, "")
    # services from GT should be parsed in the same way as modality
    directory_id: int = m(row, "directory_id", int)
    if directory_id == 3:
        raw += " " + m(row, "services", str, "")
    if not raw:
        return
    # BUG FIX: 'couples'/'couple' are collapsed to a \x00 placeholder
    # and restored after the chain.  Previously, replacing 'couple'
    # AFTER 'couples' re-matched inside the freshly inserted ';couples;'
    # text, producing ';;couples;s;' and a bogus standalone 's' token
    # that got stored as a modality.
    replaced_raw = raw.strip().lower() \
        .replace(":", ' ') \
        .replace("&", ' and ') \
        .replace(")", ' ') \
        .replace("(", ' ') \
        .replace('"', ' ') \
        .replace("-", ' ') \
        .replace(".", ' ') \
        .replace(",", ' ') \
        .replace("=", ' ') \
        .replace("|", ' ') \
        .replace("®", '') \
        .replace('©', '') \
        .replace('†', '') \
        .replace("*", ' ') \
        .replace("for ", ' ') \
        .replace("rational emotive behavioral", 'rebt') \
        .replace("rational emotive behavior", 'rebt') \
        .replace("psychotherapies", "psychotherapy") \
        .replace('therapy', ';therapy;') \
        .replace('psychotherapy', ';psychotherapy;') \
        .replace('psychology', ';psychology;') \
        .replace('psychoanalysis', ';psychoanalysis;') \
        .replace('couples', '\x00') \
        .replace('couple', '\x00') \
        .replace('/', ';') \
        .replace("+", ' ')
    replaced_raw = replaced_raw.replace('\x00', ';couples;')
    replaced_raw = OrientationMunger.MULTI_WHITESPACE_STRIP \
        .sub(' ', replaced_raw)
    # NOTE(review): the 'therapy' replace above already splits the word
    # 'psychotherapy', so the 'psychotherapy' replace rarely fires;
    # PSYCHO_SUFFIX appears to re-join those "psycho ;therapy;" cases —
    # confirm against its pattern.
    replaced_raw = self.PSYCHO_SUFFIX.sub(';psychotherapy;', replaced_raw)
    if not replaced_raw:
        return
    added = set()
    records = []
    for token in replaced_raw.split(';'):
        token = token.strip()
        if not token or token in added:
            continue
        added.add(token)
        # Get-or-create the modality row for this token.
        modality: Modality = self._session.query(Modality).filter_by(
            name=token).options(load_only('id')).one_or_none()
        if not modality:
            modality = Modality(name=token)
            self._session.add(modality)
        records.append(modality)
    already = set(provider.modalities)
    for record in records:
        if record not in already:
            provider.modalities.append(record)
def process_row(self, row: OrderedDict, provider: Provider) -> None:
    """Detect specialties in the free-text 'specialties' field by
    matching each cleaned token against COMPILED_REGEXPS, and relate
    the matched specialty records to the provider.

    Per-token results are memoized in self._cache; tokens known to
    match nothing are short-circuited via self._unknown_keys.
    """
    raw = m(row, 'specialties', str)
    if not raw:
        return
    # clean it up
    processed = raw.lower().replace("--", " ") \
        .replace("(", "") \
        .replace(")", "") \
        .replace("'", "") \
        .replace('"', '') \
        .replace('.', '') \
        .replace(",", ";")
    found = set()
    for token in processed.split(';'):
        token = token.strip()
        # Edge case, no token
        if not token:
            continue
        # Tokens shorter than 3 chars are noise.
        if len(token) < 3:
            continue
        # Edge case, we already know there are no specialties for this str
        if token in self._unknown_keys:
            continue
        # Have we never encountered this string before?
        if token not in self._cache:
            detected_specialties = set()
            # If not, test it against all regexes
            for pattern, specialty in COMPILED_REGEXPS:
                s_record = self._records[specialty]
                # If we've already added this specialty dont bother matching
                if s_record in detected_specialties:
                    continue
                if pattern.search(token):
                    # If it matches record that fact in the cache
                    detected_specialties.add(s_record)
            # Save to the cache to avoid doing this again
            self._cache[token] = detected_specialties
            # If we detected nothing, continue on
            if len(detected_specialties) == 0:
                self._unknown_keys.add(token)
                continue
        else:
            self._cache_hits += 1
            detected_specialties = self._cache[token]
        # Save these as having been detected for this provider
        found.update(detected_specialties)
    # Edge case: nothing found
    if len(found) == 0:
        return
    already = {x for x in provider.specialties}
    # Reconcile by doing set disjunction
    for record in found:
        if record not in already:
            provider.specialties.append(record)
def process_providers(self, tables: Mapping[str, RawTable],
                      update_columns: bool) -> None:
    """Merge raw provider_records rows into canonical Provider rows and
    run every plugin over each row.

    For a new canonical provider (or when update_columns is set), build
    the Provider fields from ROW_FIELDS, honoring per-field directory
    priorities: a field is skipped when a higher-priority directory for
    the same canonical id was already processed.  Commits once per row.
    """
    for plugin in self._plugins:
        plugin.pre_process()
    table = tables['provider_records']
    columns, rows = table.get_table_components()
    # canonical id -> set of directory ids already seen for it.
    directories: MutableMapping[int, Set[int]] = {}
    current_year: int = datetime.datetime.now().year
    i = 0
    bar = progressbar.ProgressBar(max_value=len(rows), initial_value=i)
    for row in rows:
        row_id = m(row, 'id', int)
        assert row_id, "there must be a row id"
        directory_id: Union[int, None] = m(row, 'directory_id', int)
        # Map the raw row id to its canonical provider id via Redis.
        canonical_id: int = int(self._r.hget(ROW_ID_HASH, row_id))
        # Does this provider exist?
        provider: Provider = self._session.query(Provider).filter_by(
            id=canonical_id).one_or_none()
        if not provider or update_columns:
            dirs: Union[Set[int], None] = directories.get(canonical_id, None)
            args = {}
            for row_name, row_params in self.ROW_FIELDS.items():
                coercer, priorities = row_params
                # @TODO: NOTE!! If a higher priority row had no value, lower
                # @TODO: priority rows WITH values will skip!! I am highly
                # @TODO: suspicious that this will work for all cases!!
                if dirs:
                    skip = False
                    for priority in priorities:
                        # Are we this priority?
                        if directory_id == priority:
                            break
                        # Do we already have a higher priority?
                        if priority in dirs:
                            skip = True
                            continue
                    if skip:
                        continue
                # Get the value
                coerced_value = m(row, row_name, coercer)
                # This check is important because we want fields that are
                # set to null to not overwrite existing fields from other
                # record sources
                if coerced_value is not None:
                    args[row_name] = coerced_value
            # Regardless of the outcome, we can still merge a new record
            args['id'] = canonical_id
            provider: Provider = Provider(**args)
            provider = self._session.merge(provider)
        # A special case
        # BUG FIX: the condition was inverted (`if not yip or yip < 1:`),
        # which crashed on None and wrote garbage for 0; only a positive
        # years_in_practice yields a sensible began_practice (matches
        # update_began_practice's guard).
        yip: int = m(row, 'years_in_practice', int)
        if yip and yip >= 1:
            provider.began_practice = current_year - yip
        # Relate the provider to the directory
        if directory_id and directory_id in self._directory_map:
            found = False
            for directory in provider.directories:
                if directory.id == directory_id:
                    found = True
                    break
            if not found:
                provider.directories.append(
                    self._directory_map[directory_id])
        # Do all the plugins
        for plugin in self._plugins:
            plugin.process_row(row, provider)
        # Save the directories processed for this canonical ID so that when
        # we find another one we can evaluate priority
        if canonical_id not in directories:
            directories[canonical_id] = {directory_id}
        else:
            directories[canonical_id].add(directory_id)
        self._session.commit()
        i += 1
        bar.update(i)
    self._session.flush()
    for plugin in self._plugins:
        plugin.post_process()