def splitAddress(self):
    """Split the mailing address and every location address into parts.

    Parses ``self.policy_data["Mailing_Address_Full"]`` and, for each entry
    of ``self.locations``, the text after the first ``:`` in
    ``location.data["Location_Of_Insured_Property"]`` with pyap (country
    ``'CA'``). The first match of each parse is copied into the
    ``Mailing_Address_*`` / ``Physical_Address_*`` keys. Nothing is written
    when the source value is ``None`` or when pyap finds no address; a
    location value without a ``:`` prints ``Error Parsing Address``.
    """
    # (suffix of the destination key, key in the dict returned by as_dict())
    # NOTE(review): "Unit" receives pyap's street_number - confirm that this
    # is the intended column.
    field_map = (
        ("Full", "full_address"),
        ("Unit", "street_number"),
        ("Street", "street_name"),
        ("Province", "region1"),
        ("PostalCode", "postal_code"),
        ("City", "city"),
    )

    def store_first_match(text, target, prefix):
        # Parse *text* and copy the parts of the first match into *target*.
        matches = pyap.parse(text, country='CA')
        if not matches:
            return
        parts = matches[0].as_dict()
        if parts is None:
            return
        for suffix, source_key in field_map:
            target[prefix + suffix] = parts[source_key]

    mailing = self.policy_data["Mailing_Address_Full"]
    if mailing is not None:
        store_first_match(mailing, self.policy_data, "Mailing_Address_")

    for location in self.locations:
        raw = location.data["Location_Of_Insured_Property"]
        if raw is None:
            continue
        try:
            # The address is expected after a "label:" prefix; a value
            # without a colon raises IndexError here.
            store_first_match(raw.split(":")[1], location.data,
                              "Physical_Address_")
        except IndexError:
            print("Error Parsing Address")
def address(text):
    """
    Return the first US address found in *text*, or ``None``.

    Commas and unit markers of the form ``#<digits>`` are removed from the
    text before it is handed to pyap.

    text: input text from which the address is to be extracted
    """
    cleaned = re.sub(r',|#\d+', "", text)
    # Parse once and reuse the result instead of running pyap twice.
    matches = pyap.parse(cleaned, country='US')
    return matches[0] if matches else None
Example #3
0
def find_info(value_matrix, sheet):
    """Extract contact details from each row and write them to *sheet*.

    For every row ``v`` of *value_matrix*, ``v[0]`` is taken as the provider
    name and ``v[1]`` is searched for a US address (pyap), a URL
    (``URLExtract``) and a US phone number (``phonenumbers``). One
    ``[provider, phone, url, address]`` row is collected per input row; a
    value that cannot be found is stored as ``""``. When several phone
    numbers match, the last one is kept. The collected rows are passed to
    ``sheet.update_values("C:H", ...)``.
    """
    url_finder = URLExtract()  # built once; it does not depend on the row
    data_matrix = []

    for v in value_matrix:
        # Provider name from matrix
        provider_string = v[0]

        # Find address from matrix value index 1
        try:
            add_string = str(pyap.parse(v[1], country='US')[0])
        except Exception:
            # Expected when no address is found; store "" in the matrix.
            add_string = ""

        # Find URL from matrix value index 1
        try:
            web_string = url_finder.find_urls(v[1])[0]
        except Exception:
            # Expected when no URL is found; store "" in the matrix.
            web_string = ""

        # Find phone number from matrix value index 1 (last match wins)
        phone_string = ""
        for match in phonenumbers.PhoneNumberMatcher(v[1], "US"):
            phone_string = phonenumbers.format_number(
                match.number, phonenumbers.PhoneNumberFormat.NATIONAL)

        data_matrix.append(
            [provider_string, phone_string, web_string, add_string])

    # Update cell range with found values
    sheet.update_values("C:H", data_matrix)
    def _extract_variants_from_text(self, field, text: str):
        """Geocode every address found in *text*.

        pyap (country ``'US'``) is tried first; ``get_addresses`` is only
        consulted when pyap finds nothing. Returns a list holding one
        ``AddressField._get_from_geocode`` result per address.
        """
        found = list(pyap.parse(text, country='US')) or list(get_addresses(text))

        variants = []
        for candidate in found:
            variants.append(AddressField._get_from_geocode(candidate))
        return variants
Example #5
0
def test_parse_address():
    """Check ``parse`` and ``_parse_address`` with and without an address."""
    no_address = 'No address here'

    # Text without an address gives a falsy result from both entry points.
    assert not parser.AddressParser(country='US').parse(no_address)
    assert not parser.AddressParser(country='US')._parse_address(no_address)

    # An address embedded in filler text is returned without the filler.
    sample = ("xxx 225 E. John Carpenter Freeway, "
              "Suite 1500 Irving, Texas 75062 xxx")
    found = parser.AddressParser(country='US').parse(sample)
    expected = "225 E. John Carpenter Freeway, Suite 1500 Irving, Texas 75062"
    assert found[0].full_address == expected
 def Address_Search(self, test_address):  # NOT in use
     """Print each US address pyap finds in *test_address*, then its parts."""
     for found in pyap.parse(test_address, country='US'):
         # the matched address itself
         print(found)
         # its components as a dict
         print(found.as_dict())
Example #7
0
        def find_address(responce, contact_link):
            """Return the first US address found in the page text, or ``None``.

            ``responce`` is the page response whose ``body`` is parsed with
            BeautifulSoup after ``<script>``/``<style>`` elements are removed.
            ``contact_link`` is a URL for a follow-up request (may be empty).
            """
            # Read from the argument: the body previously referred to an
            # undeclared name ``response`` and ignored this parameter.
            soup = BeautifulSoup(responce.body, 'lxml')
            for script in soup(["script", "style"]):
                script.extract()

            try:
                address = str(pyap.parse(soup.text, country='US')[0])
            except Exception:
                # no address on the page (or the text could not be parsed)
                address = None

            # NOTE(review): ``address`` is either a matched address or None,
            # so this branch only fires for an empty match string, and the
            # request it builds is never yielded or returned. Confirm whether
            # a follow-up crawl of the contact page was intended.
            if contact_link and address == "":
                try:
                    request = SeleniumRequest(
                        url=contact_link,
                        callback=find_address,
                        meta={'splash': {'endpoint': 'render.html',
                                         'args': {'html': 1,
                                                  'png': 1,
                                                  'width': 600,
                                                  'render_all': 1,
                                                  'wait': 0.5}}})
                except Exception:
                    # best effort, as before: a failed request is ignored
                    pass

            return address
Example #8
0
def get_string(text):
    """Read the file at path *text* and return the US addresses pyap finds.

    Newline characters are removed before parsing. The result is printed as
    well as returned.
    """
    with open(text, 'r') as handle:
        flattened = "".join(handle.read().split('\n'))

    found = pyap.parse(flattened, country='US')
    print(found)
    return found
Example #9
0
    def parse_url(self, response):
        """Collect addresses and phone numbers from a page into ``self.data``.

        The entry is keyed by ``response.meta.get('key')`` and created on
        first use. Every address pyap finds (country ``'US'``) is appended as
        a dict to the entry's ``'address'`` list. Every phone-number match is
        printed and its matched text appended to the ``'phone'`` list.
        """
        # get all text on page as plain text
        text = ''.join(response.xpath('//body//text()').extract())
        key = response.meta.get('key')

        # url might not be here if the user is passing in a plain file
        if key not in self.data:
            self.data[key] = {
                'url': key,
                'name': [],
                'phone': [],
                'address': []
            }
        entry = self.data[key]

        # parse out address
        for addr in pyap.parse(text, country='US'):
            entry['address'].append(addr.as_dict())

        # https://stackoverflow.com/questions/34527917/extracting-phone-numbers-from-a-free-form-text-in-python-by-using-regex
        # Raw string: in a plain literal "\b" is a backspace character, so
        # the pattern could never match at a word boundary.
        phone_pattern = (
            r'\(?\b[2-9][0-9]{2}\)?[-. ]?[2-9][0-9]{2}[-. ]?[0-9]{4}\b')
        for phone in re.finditer(phone_pattern, text):
            print(phone)
            # store the matched text rather than the re.Match object
            entry['phone'].append(phone.group())
Example #10
0
def location(address):
    """Return the first US address pyap finds in *address*, or ``None``.

    Any exception raised while parsing, including "no match", is swallowed
    and reported as ``None``.
    """
    try:
        matches = pyap.parse(address, country='US')
        first = matches[0]
    except Exception:
        return None
    return first
def listAddresses(myText):
    """Return the word tokens of every US address pyap finds in *myText*.

    Each address is converted to a string and split with
    ``nltk.word_tokenize``. The tokens of all addresses are returned as one
    flat list, in the order pyap returned the addresses.
    """
    tokens = []
    for found in pyap.parse(myText, country='US'):
        tokens.extend(nltk.word_tokenize(str(found)))
    return tokens
Example #12
0
def address_extract(text):
	"""Return whatever ``pyap.parse`` finds in *text* for country ``'US'``."""
	return pyap.parse(text, country='US')
Example #13
0
 def addressFinder(self, string):
     """Find the first US address in *string* and geocode it.

     Returns ``(street_address, city, lat, lng)``, where ``street_address``
     is the house number and street reported by ``geocoder.google``. If no
     address is found or the lookup fails, four empty strings are returned.
     """
     try:
         found_address = str(pyap.parse(string, country='US')[0])
         g = geocoder.google(found_address)
         address = g.housenumber + " " + g.street
         return address, g.city, g.lat, g.lng
     except Exception:
         # Best effort: any parsing or geocoding failure yields the empty
         # result. Unlike a bare ``except`` this lets KeyboardInterrupt and
         # SystemExit propagate.
         return "", "", "", ""
Example #14
0
def addr(P):
    """Mask every US address found in *P*.

    Each address returned by ``ap.parse`` is replaced in the text by a run of
    block characters of the same length as the address string. Returns
    ``(masked_text, number_of_addresses)``.
    """
    masked = P
    count = 0
    for found in ap.parse(P, country='US'):
        needle = str(found)
        masked = masked.replace(needle, "█" * len(needle))
        count += 1
    return masked, count
Example #15
0
def pyapGetEvent_Locations(searchString):
  """Return the US addresses pyap finds in *searchString*.

  The text is UTF-8 encoded before it is parsed. The result is a dict with
  ``"numLocations"`` (the number of matches) and ``"addresses"`` (one
  ``as_dict()`` dict per match).
  """
  found = pyap.parse(searchString.encode("utf-8"), country='US')
  return {
      "numLocations": len(found),
      "addresses": [item.as_dict() for item in found],
  }
def test_full_address_parts():
    """Check that each part of a GB address is captured by the right regex.

    Every example address is parsed on its own and with filler text before
    and/or after it, joined by each of several separators. Exactly one
    address must be found and every listed part must match.
    """
    normalize = pyap.parser.AddressParser._normalize_string
    filler = "This is filler text that can be inserted both before and after addresses"
    separators = ["\n", ", ", ". ", " "]
    cases = [
        {
            'full_address': '9 Shaun glen, East Joan, LN4 1LE',
            'street_name': 'Shaun glen',
            'street_number': '9',
            'postal_code': 'LN4 1LE',
        },
        {
            'full_address':
            '11-59 High Road\nEast Finchley London\nN2 8AW, UK',
            'street_name': 'High Road',
            'street_number': '11-59',
            'postal_code': 'N2 8AW',
            'country': 'UK',
        },
        {
            'full_address':
            'Studio 53, Harrison cove, Smithbury, G88 4US, United Kingdom',
            'occupancy': 'Studio 53',
            'street_name': 'Harrison cove',
            'postal_code': 'G88 4US',
            'country': 'United Kingdom',
        },
    ]

    for expected_parts in cases:
        # Same order as three nested loops: filler before, filler after,
        # then the separator used to join filler and address.
        combos = itertools.product([False, True], [False, True], separators)
        for with_prefix, with_suffix, sep in combos:
            prefix = filler + sep if with_prefix else ''
            suffix = sep + filler if with_suffix else ''
            sample = prefix + expected_parts['full_address'] + suffix

            matches = pyap.parse(sample, country='GB')
            print(normalize(sample))
            # Ensure that only one address is found
            assert len(matches) == 1
            first = matches[0]
            for part_name, part_value in expected_parts.items():
                if part_name == 'full_address':
                    assert first.full_address == normalize(part_value)
                else:
                    # every other listed part must equal the parsed attribute
                    assert first.__getattribute__(part_name) == part_value
Example #17
0
    def iter_filth(self, text, document_name: Optional[str] = None):
        """Yield a filth object for every occurrence of an address in *text*.

        Candidates are found with pyap for ``self.region``. A candidate is
        skipped when it contains one of ``self.ignored_words``, when
        ``postal.parser.parse_address`` splits it into fewer than
        ``self.minimum_address_sections`` parts, or when a field listed in
        ``self.match_pyap_postal_fields`` is not confirmed by that second
        parse. Each remaining address is searched for in the original text
        and every hit is yielded as ``self.filth_cls``.
        """
        for address in pyap.parse(text, country=self.region):
            full_address = address.full_address
            lowered = full_address.lower()

            # Ignore any addresses containing any explicitly ignored words
            if any(word.lower() in lowered for word in self.ignored_words):
                continue

            postal_address = None
            if self.minimum_address_sections > 0:
                postal_address = postal.parser.parse_address(full_address)
                # Ensure that there are enough parts of the address to be a real address
                if len(postal_address) < self.minimum_address_sections:
                    continue

            if len(self.match_pyap_postal_fields) > 0:
                if postal_address is None:
                    postal_address = postal.parser.parse_address(full_address)
                # Check the two parses agree on part of the address. The
                # ``continue`` used to sit inside the field loop, where it
                # only advanced that loop and never skipped the address.
                fields_agree = all(
                    address.__getattribute__(pyap_field).lower() in [
                        part[0] for part in postal_address
                        if part[1] == postal_field
                    ]
                    for pyap_field, postal_field
                    in self.match_pyap_postal_fields.items()
                )
                if not fields_agree:
                    continue

            # It seems to be a real address, lets look for it in the text
            # This is needed as pyap does some text normalisation, this undoes that normalisation
            # See _normalize_string() in https://github.com/vladimarius/pyap/blob/master/pyap/parser.py
            pattern = re.escape(full_address)
            pattern = pattern.replace(r',\ ', r'\s*([\n,]\s*)+')
            pattern = pattern.replace(r'\ ', r'\s+')
            # re.escape emits "\-"; replacing only "-" left the backslash in
            # front of "[", so the class was matched as literal text. The
            # plain hyphen is included so un-normalised text still matches.
            pattern = pattern.replace(r'\-', '[-‐‑‒–—―]')
            pattern = r'\b' + pattern + r'\b'
            found_strings = re.finditer(pattern, text,
                                        re.MULTILINE | re.UNICODE)

            # Iterate over each found string matching this regex and yield some filth
            for instance in found_strings:
                yield self.filth_cls(
                    beg=instance.start(),
                    end=instance.end(),
                    text=instance.group(),
                    detector_name=self.name,
                    document_name=document_name,
                    locale=self.locale,
                )
Example #18
0
def hello_from_body(args):
    """Extract addresses from the posted text and hand the HTML to storage.

    Reads ``textBlob``, ``htmlBlob`` and ``url`` from *args* (each defaults
    to ``""``), parses the text for US addresses, calls
    ``writeToGcs(url, html)`` and returns
    ``({"addresses": [...], "urlId": getUrlId(url)}, 200)``.
    """
    found = pyap.parse(args.get("textBlob", ""), country='US')
    addr = [str(item) for item in found]

    html = args.get("htmlBlob", "")
    url = args.get("url", "")
    writeToGcs(url, html)

    return {"addresses": addr, "urlId": getUrlId(url)}, 200
Example #19
0
def parse_adress(colonne, no_match=no_match):
    """Return the first Canadian address in *colonne* as a JSON string.

    The JSON encodes the ``as_dict()`` of the first pyap match, or ``""``
    when nothing is found. If parsing raises ``TypeError`` the offending
    value is printed and ``""`` is encoded. ``no_match`` is accepted but not
    used in this function.
    """
    result = ""
    try:
        matches = pyap.parse(colonne, country='CA')
        try:
            result = matches[0].as_dict()
        except IndexError:
            # no address found: keep the empty-string placeholder
            pass
    except TypeError:
        print(colonne)
    return json.dumps(result)
Example #20
0
def parse(page):
    """Find addresses in *page* and re-parse each one into its components.

    pyap is tried with country ``'US'`` first and, only when that returns an
    empty list, with ``'CA'``. Duplicate matches are dropped, keeping the
    first occurrence. Each remaining match is converted to a string and
    parsed with ``US.parse`` (US results) or ``AddressParser().parse`` (CA
    results). The parsed values are returned as a list.
    """
    us_code, ca_code = 'US', 'CA'

    is_US = True
    matches = pyap.parse(page, country=us_code)
    if matches == []:
        is_US = False
        matches = pyap.parse(page, country=ca_code)

    # order-preserving de-duplication
    unique = []
    for match in matches:
        if match not in unique:
            unique.append(match)

    return [
        US.parse(str(match)) if is_US else AddressParser().parse(str(match))
        for match in unique
    ]
Example #21
0
def process_text(q):
    """Run ``nlp`` on *q*, add address entities, and filter the entities.

    ``matcher`` results are merged in with ``merge_and_add_ents``. Every US
    address pyap finds is located in the doc with ``find_span`` and appended
    to ``doc.ents`` under ``ADDRESS_ID``. Returns ``(doc, entities)`` where
    ``entities`` are the doc entities whose ``label_`` is in ``ENT_LIST``.
    """
    doc = nlp(q)
    merge_and_add_ents(doc, matcher(doc))

    for found in pyap.parse(q, country='US'):
        address_doc = nlp(str(found).decode('utf-8'))
        span = find_span(doc, address_doc)
        doc.ents += ((ADDRESS_ID, span[0], span[1]), )

    wanted = []
    for ent in list(doc.ents):
        if ent.label_ in ENT_LIST:
            wanted.append(ent)
    return doc, wanted
Example #22
0
def fetch_address(arr_body):
    """Return the US addresses found in a message body.

    *arr_body* is passed through ``preprocess``, the resulting lines are
    joined with newlines and the text is parsed with pyap. The matches are
    returned as a new list, empty when nothing is found.
    """
    msg_body = '\n'.join(preprocess(arr_body))
    addresses = pyap.parse(msg_body, country='US')
    # Copying the matches needs no try/except; the former bare ``except``
    # around the copy loop could only hide unrelated errors.
    return list(addresses) if addresses else []
    def _extract_from_possible_value(self, field, possible_value):
        """Geocode the address contained in *possible_value*.

        Returns ``None`` for a falsy value. For a dict the ``'address'``
        entry is used. Any other value is converted to a string and searched
        with pyap (country ``'US'``), then with ``get_addresses`` if pyap
        finds nothing; the first hit is used, or the whole string when
        neither finds an address. The chosen value is passed to
        ``AddressField._get_from_geocode`` and its result returned.
        """
        if not possible_value:
            return None

        # isinstance also accepts dict subclasses (e.g. OrderedDict), which
        # the exact ``type(...) is dict`` test sent down the string path.
        if isinstance(possible_value, dict):
            address = possible_value.get('address')
        else:
            text = str(possible_value)
            addresses = list(pyap.parse(text, country='US'))
            if not addresses:
                addresses = list(get_addresses(text))
            address = addresses[0] if addresses else text

        return AddressField._get_from_geocode(address)
        def find_base(soup, country='us'):
            """
            Find addresses using pyap package

            Removes ``<script>``/``<style>`` elements from *soup*, parses the
            remaining text for *country* and returns all matches in one
            string, each preceded by a single space (``''`` when nothing is
            found).
            """
            for script in soup(["script", "style"]):
                script.extract()
            text = soup.get_text()

            # Honour the ``country`` argument; it used to be ignored in
            # favour of a hard-coded 'us' (which is still the default).
            adr = pyap.parse(text, country=country)
            return ''.join(' ' + str(item) for item in adr)
    def _extract_variants_from_text(self, field, text: str, **kwargs):
        """Geocode every address found in *text*, resolving duplicates once.

        pyap (country ``'US'``) is tried first; ``get_addresses`` is used
        only when pyap finds nothing. Returns one
        ``AddressField._get_from_geocode`` result per address, in order. A
        repeated address reuses the earlier result unless that result was
        ``None``, in which case it is looked up again.
        """
        addresses = list(pyap.parse(text, country='US'))
        if not addresses:
            addresses = list(get_addresses(text))

        resolved_addresses = {}
        result = []
        # Plain iteration replaces ``while addresses: addresses.pop(0)``,
        # which shifted the whole list on every step.
        for address in addresses:
            resolved_address = resolved_addresses.get(address)
            if resolved_address is None:
                resolved_address = AddressField._get_from_geocode(address)
                resolved_addresses[address] = resolved_address
            result.append(resolved_address)
        return result
def allot_values():
    """Fill the contact columns of every data row in ``final_sheet``.

    The free text in column 12 of each row (the first row is skipped) is
    mined for:

    * a US address (pyap) -> columns 1-5;
    * URLs and e-mail addresses (``URLExtract``) -> column 11 for URLs and
      column 7 for anything containing ``@``; the first hit of each wins;
    * a phone number following the word "fax" -> column 10, and the number
      is replaced by ``[Redacted]`` in column 12;
    * every other phone number -> the first still-empty one of columns 8, 9
      and 10.

    Column 6 is set to ``'service'`` and columns 14-18 to ``'08:00-17:00'``.
    """
    # TODO: Address finder breaks when state abbrev missing. Could find by zip code.
    # TODO: Address finder also breaks "Fort Wayne" into "Fort Way" "NE"
    rfc3966 = phonenumbers.PhoneNumberFormat.RFC3966

    for v in final_sheet[1:]:
        address = pyap.parse(v[12].upper(), country='US')  # Made Upper because Lower and Title confuse pyap
        if address:
            # Build the parts dict once instead of once per field.
            parts = address[0].as_dict()
            address_list = [parts['street_number'], title(parts['street_name']),
                            title(parts['street_type']), parts['route_id'],
                            parts['post_direction']]
            v[1] = ' '.join([x for x in address_list if x])
            address2 = [title(parts['floor']), title(parts['building_id']),
                        title(parts['occupancy'])]
            v[2] = ' '.join([x for x in address2 if x])
            v[3] = title(parts['city'])
            v[4] = parts['region1']
            v[5] = parts['postal_code']

        urls = URLExtract(extract_email=True).find_urls(v[12].lower())

        if urls:
            # Reverse order so that the first hit is the one that remains.
            for url in urls[::-1]:
                if '@' in url:  # This is a simplistic way to find email, a url could also have an @
                    v[7] = url  # Overwriting because I have nowhere to store additional urls/emails
                else:
                    v[11] = url

        fax = v[12].lower().find('fax')
        if fax > -1:  # Find returns -1 if no instance found
            try:
                match = phonenumbers.PhoneNumberMatcher(v[12][fax:], 'US').next()
                v[12] = v[12][:fax] + v[12][fax:][0:match.start] + '[Redacted]' + v[12][fax:][match.end:]
                # [7:] drops the leading part of the RFC3966 string
                v[10] = phonenumbers.format_number(match.number, rfc3966)[7:]
            except StopIteration:
                pass

        for match in phonenumbers.PhoneNumberMatcher(v[12], "US"):
            # each number goes into the first phone column that is still empty
            for slot in (8, 9, 10):
                if not v[slot]:
                    v[slot] = phonenumbers.format_number(match.number, rfc3966)[7:]
                    break
        v[6] = 'service'
        v[14:19] = ['08:00-17:00'] * 5
Example #27
0
    def extract_entities(self, text):
        """Append the entities found in *text* to ``self.entities``; return it.

        Addresses are handled first: pyap matches (country ``'US'``) are
        used if there are any, otherwise the matches of
        ``utils.SHORT_ADDRESS_REGEX``. Each is recorded as a ``LOCATION``
        entity with ``score`` 1 and its first occurrence in the text is
        replaced by a space. The remaining regexes are then applied in turn
        to the progressively blanked text, and each match is recorded under
        its own ``self.CODEC`` type.
        """
        regexs = {
            'TIME': utils.TIME_REGEX,
            'DATE': utils.DATE_REGEX,
            'ZIP': utils.ZIP_REGEX,
            'EMAIL': utils.EMAIL_REGEX,
            'CURRENCY': utils.CURRENCY_REGEX,
            'TAX_ID': utils.TAX_REGEX,
            'PHONE_NUMBER': utils.PHONE_NUMBER_REGEX,
        }

        long_address_matches = pyap.parse(text, country='US')
        short_address_matches = re.findall(utils.SHORT_ADDRESS_REGEX, text)

        # Prefer the pyap matches; fall back to the short-address regex.
        if len(long_address_matches) != 0:
            address_matches = long_address_matches
        elif len(short_address_matches) != 0:
            address_matches = short_address_matches
        else:
            address_matches = []

        for address in address_matches:
            found = str(address)
            text = text.replace(found, ' ', 1)
            self.entities.append({
                'text': found,
                'type': self.CODEC['LOCATION'],
                'score': 1
            })

        for label, pattern in regexs.items():
            for ent in re.findall(pattern, text):
                # findall yields tuples when the pattern has several groups
                if isinstance(ent, tuple):
                    ent = ent[0]
                text = text.replace(ent, '  ', 1)
                self.entities.append({
                    'text': str(ent),
                    'type': self.CODEC[label],
                })

        return self.entities
Example #28
0
def fetch_address(arr_body):
    """Take the list of body lines and extract the US addresses from it.

    *arr_body* is refined with ``preprocess``, joined into one
    newline-separated string and parsed with pyap (country ``'US'``). The
    matches are returned as a new list, empty when nothing is found.
    """
    msg_body = '\n'.join(preprocess(arr_body))  # refine, then make one string
    addresses = pyap.parse(msg_body, country='US')
    # Copying the matches needs no try/except; the former bare ``except``
    # around the copy loop could only hide unrelated errors.
    return list(addresses) if addresses else []
def addresser():
    """Parse the posted ``address`` form field and pass the parts to ``cd``.

    The US addresses pyap finds are turned into a DataFrame with the columns
    ``full_street``, ``state``, ``zip_code``, ``street_name`` and
    ``street_number`` (one row per address), which is handed to
    ``cd.get_data``. Responds with the JSON of the last address's parts, or
    with the message ``'There is no addresses present'`` when nothing is
    found.
    """
    test_address = request.form.get("address")
    # A missing form field would make pyap raise; treat it as "nothing found".
    addresses = pyap.parse(test_address, country='US') if test_address else []
    print(addresses)
    if not addresses:
        data = 'There is no addresses present'
        return jsonify(data)

    # (output column, key in the dict returned by as_dict())
    columns = (
        ('full_street', 'full_street'),
        ('state', 'region1'),
        ('zip_code', 'postal_code'),
        ('street_name', 'street_name'),
        ('street_number', 'street_number'),
    )
    rows = []
    for address in addresses:
        full_address = address.as_dict()
        rows.append({column: full_address[key] for column, key in columns})

    # The street-number series used to be named 'street_name' as well, which
    # produced two columns with the same label.
    data = pd.DataFrame(rows, columns=[column for column, _ in columns])
    cd.get_data(data)
    # NOTE(review): only the parts of the last address are returned - confirm
    # that this is what the client expects.
    return jsonify(full_address)
Example #30
0
    def Text_to_String(self, filename):
        """Extract the first US address from every ``*.txt`` file in a folder.

        *filename* is a directory name relative to ``self.og``. The process
        working directory is changed into it; each text file is read, parsed
        with pyap and then deleted. Finally the working directory moves one
        level up and the directory *filename* is removed.

        Returns ``[addylist, noaddylist]``: ``addylist`` holds
        ``[file, first_address]`` pairs and ``noaddylist`` the names of the
        files in which no address was found.
        """
        noaddylist = []
        addylist = []

        print(os.path.isdir(filename))
        print(filename)
        cwd = os.path.join(self.og, filename)
        os.chdir(cwd)
        print(cwd)

        for txt_name in glob.glob('*.txt'):
            # ``with`` closes the handle before the file is deleted below
            with open(txt_name, 'r') as handle:
                temp = handle.read().strip()
            addy = [str(address) for address in pyap.parse(temp, country='US')]

            if not addy:
                # This branch used to index addy[0] and raised IndexError
                # for every file without an address.
                print(txt_name, "no addres!")
                noaddylist.append(txt_name)
            else:
                print(addy[0], "found address with name", txt_name)
                addylist.append([txt_name, addy[0]])

            os.remove(txt_name)

        ret = [addylist, noaddylist]
        print(ret, "= ret")
        os.chdir("../")
        # os.chdir returns None, so report the directory explicitly
        print(os.getcwd())

        os.rmdir(filename)
        return ret
def extract_cities(document):
    """Print the cities and countries in *document* and any address lines.

    ``GeoText`` supplies the cities and countries. When at least one city is
    found, every regex match in the document that contains one of the city
    names is printed. Otherwise the document is parsed with pyap (country
    ``'US'``) and each address is printed, or ``No Address Found`` when there
    is none.
    """
    places = GeoText(document)
    # %-formatting gives the same output under Python 2 and Python 3
    print('Cities : %s' % (places.cities,))
    print('Countries : %s' % (places.countries,))

    if places.cities:
        print('Address')
        # NOTE(review): the character classes look unintended (e.g. "{1-5}"
        # inside brackets, "A-zZ"); pattern kept unchanged - confirm.
        r2 = re.compile(r'([(\d|-|/){1-5}]+[,|-|\s]+[A-zZ]+[Aa-zZ]+.*)')
        for text in r2.findall(document):
            for text1 in places.cities:
                if text1 in text:
                    print(text)
        return

    # No city recognised: fall back to pyap. This fallback used to be
    # guarded by ``places.cities is None``; presumably GeoText reports "no
    # cities" as an empty list, which made the branch unreachable.
    addresses = pyap.parse(document, country='US')
    if addresses:
        for address in addresses:
            # shows found address
            print(address)
    else:
        print('No Address Found')