コード例 #1
0
def tree_click(event):
    """Handle a click on the CSV treeview.

    Reads the address (second value column) of the selected row, refills the
    "expand" treeview with every string returned by ``expand_address`` and
    the "parse" treeview with the components returned by ``parse_address``.

    Args:
        event: Event object supplied by the widget binding; not used.

    Returns:
        The address string that was parsed, or ``None`` when no row is
        selected.
    """
    print('click')

    # Pick up the address of the selected row; with several rows selected
    # the last one wins.
    addr = None
    for item in tree_csv.selection():
        item_text = tree_csv.item(item, "values")
        print(item_text[0])
        addr = item_text[1]

    # Drop whatever the previous click left in both result views.
    for x in tree_expand.get_children():
        tree_expand.delete(x)
    for x in tree_parse.get_children():
        tree_parse.delete(x)

    # Without a selection there is nothing to expand or parse; this path
    # used to fail on the unbound ``addr``.
    if addr is None:
        return None

    list_addr = expand_address(addr)

    # NOTE(review): the loop used to reuse ``addr`` as its loop variable, so
    # the string parsed below was the last expansion rather than the selected
    # address. That behaviour is kept but made explicit - confirm the intent.
    parse_target = addr
    for expanded in list_addr:
        print('type of addr: {}'.format(type(expanded)))
        # Wrap the expansion in literal double quotes before inserting it.
        addr_str = '\"' + expanded + '\"'
        tree_expand.insert('', 0, text='Expand', values=addr_str)
        parse_target = expanded

    addr_parse = parse_address(parse_target)
    # Each parsed entry is a pair; its second element becomes the key.
    dict_addr = {pair[1]: pair[0] for pair in addr_parse}
    for item in dict_addr.items():
        tree_parse.insert('', 0, values=list(item))

    return parse_target
コード例 #2
0
ファイル: api.py プロジェクト: raph84/libpostal-api
def parse():
    """Parse the address found under ``request`` in the JSON request body.

    Returns:
        The request body, extended with a ``result`` entry holding the
        output of ``parse_address``, serialised as a JSON string.
    """
    payload = request.get_json()
    payload['result'] = parse_address(payload['request'])
    return json.dumps(payload)
コード例 #3
0
def find_house_number(street, house_number):
    """Return a house number, falling back to parsing it out of ``street``.

    Args:
        street: Street text handed to ``parse_address`` when no usable
            house number was supplied.
        house_number: Already known house number; returned unchanged when it
            is neither the empty string nor null according to ``pd.isnull``.

    Returns:
        ``house_number`` when usable, otherwise the ``house_number``
        component parsed from ``street``, or NaN when the parser found none.
    """
    if house_number != "" and not pd.isnull(house_number):
        return house_number

    # Index the parsed (value, label) pairs by label.
    components = {label: value for value, label in parse_address(street)}
    # ``np.nan`` is the spelling that exists in every NumPy release; the
    # ``np.NaN`` alias was removed in NumPy 2.0.
    return components.get("house_number", np.nan)
コード例 #4
0
ファイル: main.py プロジェクト: ambidextrous/address_parser
def format_address(raw_address: str) -> Dict[str, str]:
    """
    Extract street and house number from an address string.

    Calls the pypostal NLP library (https://github.com/openvenues/pypostal)
    and keeps the first "road" and the first "house_number" component,
    each passed through ``fix_capitalization`` together with the raw input.

    Example:
        format_address('Calle 39 No 1540')
        -> {"street": "Calle 39", "housenumber": "No 1540"}

    Raises:
        ValueError: if ``raw_address`` is empty, or if an ``IndexError``
            occurs while extracting the components (for example because no
            road or house number was found).
    """
    if not raw_address:
        raise ValueError(f"Cannot extract address data from {raw_address}")
    # Bound up front so the error message below can always reference it.
    parsed_address = None
    try:
        parsed_address = parse_address(raw_address)
        house_number = [
            item[0] for item in parsed_address if item[1] == "house_number"
        ][0]
        road = [item[0] for item in parsed_address if item[1] == "road"][0]
        formatted_address = {
            "street": fix_capitalization(road, raw_address),
            "housenumber": fix_capitalization(house_number, raw_address),
        }
    except IndexError as ex:
        message = f"Unable to parse street and house number data from input `{raw_address}` (parsed to `{parsed_address}`)"
        # Chain explicitly so the traceback shows the original lookup failure.
        raise ValueError(message) from ex
    return formatted_address
コード例 #5
0
ファイル: m_and_r.py プロジェクト: oisindoherty3/drem
def _parse_standardised_address(
    df: pd.DataFrame,
    target: str,
    result: str,
) -> pd.DataFrame:
    """Store the ``parse_address`` output of one column in another column.

    Args:
        df: Frame to read from and write to; it is modified in place.
        target: Name of the column whose cells are parsed.
        result: Name of the column that receives the parsed output.

    Returns:
        The same ``df`` object, now carrying the ``result`` column.
    """
    # Reading one column needs no copy of the whole frame, and the parser
    # can be applied directly without a lambda wrapper.
    df[result] = df[target].apply(parse_address)

    return df
コード例 #6
0
ファイル: api.py プロジェクト: manycoding/address-parser
    def post(self, address: str) -> Tuple[Dict, int]:
        """Parse an address string.

        Args:
            address: Address text handed to ``parse_address``.

        Returns:
            A tuple of whatever ``get_address`` builds from the parsed
            components (annotated as a dict) and the status code 200.
        """
        components = parse_address(address)
        response = get_address(components)
        return response, 200
コード例 #7
0
def parse():
    """Parse the address found under ``request`` in the JSON request body.

    Returns:
        A dict mapping each component label to its text; when a label occurs
        more than once the last occurrence wins.
    """
    payload = request.get_json()
    components = parse_address(payload['request'])
    return {entry[1]: entry[0] for entry in components}
コード例 #8
0
_NL_PIPELINE = None


def _get_nl_pipeline():
    """Load the ``nl_core_news_sm`` spaCy pipeline once and reuse it."""
    global _NL_PIPELINE
    if _NL_PIPELINE is None:
        _NL_PIPELINE = spacy.load('nl_core_news_sm')
    return _NL_PIPELINE


def address_parser(user_input):
    """Parse the named entities of ``user_input`` as an address.

    The text is run through the ``nl_core_news_sm`` spaCy pipeline, the text
    of every recognised entity is joined with single spaces, and that string
    is handed to ``parse_address``.

    Args:
        user_input: Free text to analyse.

    Returns:
        The ``jsonify`` response built from the ``parse_address`` output.
    """
    # Loading the pipeline is expensive, so it is cached across calls
    # instead of being reloaded for every request.
    doc = _get_nl_pipeline()(user_input)
    text_lib = ' '.join(entity.text for entity in doc.ents)

    return jsonify(parse_address(text_lib))
コード例 #9
0
ファイル: python-app.py プロジェクト: tanvp112/projects
 def post(self):
     """Parse the ``addressText`` request argument.

     Returns:
         The ``jsonify`` response of a dict mapping each component label to
         its text, built from the ``parse_address`` output.
     """
     request_args = addressText.parse_args()
     components = parse_address(request_args['addressText'])
     by_label = {label: value for value, label in components}
     return jsonify(by_label)
コード例 #10
0
def handler(event, context):
    """Parse ``event['address']`` and return the components by label.

    Args:
        event: Mapping that carries the address text under ``address``.
        context: Not used.

    Returns:
        A dict mapping each component label to its text; the dict is also
        printed.
    """
    result = {entry[1]: entry[0] for entry in parse_address(event['address'])}

    print("Result:")
    print(result)

    return result
コード例 #11
0
ファイル: file_chooser.py プロジェクト: Faisalsouz/PAN_OCR
def footer_text(string):
    """Turn footer text containing address-like data into an address dict.

    Phone/fax, e-mail, internet and managing-director fragments are removed
    with a regular expression; the remaining text is parsed with
    ``parse_address`` using German language and country hints.

    Args:
        string: Any string containing address-like data.

    Returns:
        A dict mapping each component label to its text.
    """
    rx = r'''(\b(Tel|Fax)\s*.*\d+|(\bE-?[mM]ail.*de)|([iI]nternet:?\s?[hH]ttps?:.*.de)|([Gg]esch[aä]ftsführer.+\S\s))'''
    # The flags must be passed by keyword: the fourth positional parameter of
    # ``re.sub`` is ``count``, so the flag value used to cap the number of
    # replacements instead of being applied as flags.
    s = re.sub(rx, '', string, flags=re.UNICODE | re.MULTILINE)
    components = parse_address(s, language='de', country='germany')
    return {label: value for value, label in components}
コード例 #12
0
ファイル: test_parser.py プロジェクト: socrateslab/pypostal
    def contains_components(self, address, components):
        """Assert that parsing ``address`` yields every expected component.

        ``components`` maps a label to the expected text. The number of
        parsed (text, label) pairs that match an entry of ``components``
        must equal ``len(components)``.
        """
        wanted = len(components)

        parsed = parse_address(address)
        self.assertTrue(parsed)

        found = sum(
            1 for text, label in parsed if components.get(label, None) == text
        )

        self.assertEqual(wanted, found)
コード例 #13
0
def format_parser(add):
    """Return selected address components of ``add`` in a fixed order.

    Returns:
        A list with the text of the house_number, road, city, state,
        postcode and unit components, in that order; a component the parser
        did not produce is represented by an empty string.
    """
    # parse_address output is a list of tuples; index it by label.
    by_label = {label: value for value, label in parse_address(add)}
    wanted = ('house_number', 'road', 'city', 'state', 'postcode', 'unit')
    return [by_label[name] if name in by_label else '' for name in wanted]
コード例 #14
0
ファイル: test_parser.py プロジェクト: Datactuariat/pypostal
    def contains_components(self, address, components):
        """Check that the parse of ``address`` contains the given components.

        ``components`` maps labels to expected text; the test passes when
        the count of matching parsed pairs equals ``len(components)``.
        """
        expected_count = len(components)

        parsed = parse_address(address)
        self.assertTrue(parsed)

        matches = [
            label for text, label in parsed
            if components.get(label, None) == text
        ]

        self.assertEqual(expected_count, len(matches))
コード例 #15
0
ファイル: model_run.py プロジェクト: Faisalsouz/PAN_OCR
def model_run(dn_cmd,pdf_path:bytes) ->List:
    """Run recognition on every page of a PDF and collect the results.

    Args:
        dn_cmd: Command passed to ``RcogPipe`` together with each page.
        pdf_path: Path handed to ``get_pdf`` to obtain the pages.

    Returns:
        A list with one dict per page. Each dict holds the page index under
        ``'PageNo:'`` plus the output of ``RcogPipe.ocr``; the
        ``'supplier:'`` and ``'footer:'`` entries, when present, are replaced
        by a dict of address components keyed by label.

    Side effects:
        Renames ``./predictions.jpg`` to ``./predictions<index>.jpg`` for
        every page and prints a completion message.
    """
    pages = []
    for page_no, pg in enumerate(get_pdf(pdf_path)):
        recogniser = RcogPipe(dn_cmd, pg)
        ocr_result = recogniser.ocr(pg)
        merged = {**{'PageNo:': str(page_no)}, **ocr_result}

        for field in ('supplier:', 'footer:'):
            if field not in merged:
                continue
            # Segregate the address: first a text -> label dict, then flip
            # it so the labels become the keys.
            by_text = dict(parse_address(merged.get(field)[0]))
            merged[field] = {label: text for text, label in by_text.items()}

        os.rename('./predictions.jpg', './predictions' + str(page_no) + '.jpg')
        pages.append(merged)

    print('Data from all PDF pages has been extracted successfully!')
    return pages
コード例 #16
0
def enrich_item_with_variants(item):
    """Enrich ``item`` in place with data derived from its label and URL.

    Reads ``item['label']``, ``item['tokens']`` and, when present,
    ``item['url']``. Adds to the ``variants``, ``acros`` and ``categories``
    collections and may set ``address_as_label``, ``city``, ``country`` and
    ``ur_id``.

    Args:
        item: Mapping describing one entity; modified in place.

    Returns:
        None.
    """
    label = item['label']

    # Acronyms
    for (acro, variant) in extractAcronymsByColocation(label):
        item['variants'].add(variant)
        item['acros'].add(acro)

    # Addresses (French or foreign)
    addr = parse_address(label)
    features = dict((f, v) for (v, f) in addr)
    # NOTE(review): a union is non-empty whenever either operand is, so this
    # test passes for practically every label. An intersection or subset test
    # may have been intended - left unchanged pending confirmation.
    if len(REQUIRED_ADDR_FEATURES | features.keys()) > 0:
        item['address_as_label'] = label
        if 'city' in features: item['city'] = features['city']
        if 'country' in features: item['country'] = features['country']

    # Unité Mixte de Recherche and such things
    for (kind, regex) in UR_REGEXES_LABEL.items():
        ms = re.findall(regex, label)
        if ms:
            for m in ms:
                variant = regex_variant(kind, m)
                logging.info('Found UMR-type match: {} in label "{}"'.format(
                    variant, label))
                item['variants'].add(variant)
                item['ur_id'] = variant
    if 'url' in item:
        url = item['url']
        for (kind, regex) in UR_REGEXES_URL.items():
            ms = re.findall(regex, url)
            if ms:
                for m in ms:
                    # Pass ``kind`` as well, matching the label branch above;
                    # the call here used to omit it.
                    variant = regex_variant(kind, m)
                    logging.info('Found UMR-type match: {} in URL {}'.format(
                        variant, url))
                    item['variants'].add(variant)
                    item['ur_id'] = variant

    # Categorization
    for token in item['tokens']:
        cat = categorize(token)
        if cat is not None: item['categories'].add(cat)

    # Duplicated tokens
    # NOTE(review): ``find('')`` always returns 0, so the branch below never
    # runs; the separator character seems to have been lost from the literal.
    i = label.find('')
    if i > 0:
        pre = justCase(label[:i])
        # Everything after the separator (a slice, not a single character).
        post = justCase(label[i + 1:])
        if post.startswith(pre): item['variants'].add(post)
コード例 #17
0
def expand_click(event):
    """Handle a click on the expand treeview: parse the selected expansion.

    Clears the parse treeview, reads the first value of the row selected in
    the expand treeview, parses it with ``parse_address`` and inserts one
    row per component into the parse treeview.

    Args:
        event: Event object supplied by the widget binding; not used.
    """
    print('focus: {}'.format(tree_csv.focus()))
    # clear the parse treeview
    for x in tree_parse.get_children():
        tree_parse.delete(x)

    # With nothing selected there is no row to read; bail out instead of
    # failing on the lookup below.
    selected = tree_expand.selection()
    if not selected:
        return

    # get the address string from the (first) selected expand row
    item_text = tree_expand.item(selected[0], "values")[0]
    addr = parse_address(item_text)
    # Each parsed entry is a pair; its second element becomes the key.
    dict_addr = {pair[1]: pair[0] for pair in addr}
    for item in dict_addr.items():
        tree_parse.insert('', 0, values=list(item))
コード例 #18
0
def normalize_addr(entry):
    """Add the parsed components of ``entry['exp_addr']`` to ``entry``.

    The address is parsed with English / US hints, and every component is
    stored under the key ``'pypost_' + <component label>``.

    Args:
        entry: Mapping with an ``exp_addr`` key; modified in place.

    Returns:
        The same ``entry`` object.
    """
    # The language and country are known, which is passed on to the parser.
    components = parse_address(entry['exp_addr'], language='en', country='us')

    for column_val, column_name in components:
        entry['pypost_' + column_name] = column_val

    return entry
コード例 #19
0
def segment_address(address):
    """
    Split an address string into components with the libpostal parser.

    Every word of every component is capitalised. The returned dict maps
    each component label to its text and additionally holds, under
    ``formatted_address``, all component texts joined by single spaces.
    """
    address_dict = {
        key: ' '.join(word.capitalize() for word in value.split())
        for value, key in parse_address(address)
    }

    # The join is evaluated before the new key is stored, so it only sees
    # the parsed components.
    address_dict['formatted_address'] = ' '.join(address_dict.values())

    return address_dict
コード例 #20
0
def focus_to(event):
    """Select the row whose node id was entered, then expand and parse it.

    Reads the node id from ``var01``; when it is in ``df_process.index`` the
    matching treeview row is selected and scrolled into view, and the expand
    and parse treeviews are refilled from its address. Otherwise an error
    dialog is shown.

    Args:
        event: Event object supplied by the widget binding; not used.
    """
    tex = var01.get()  # node id typed into the entry box

    # ``Index.contains`` was removed from pandas; the ``in`` operator is the
    # supported membership test.
    if tex not in df_process.index:
        print('id wrong.')
        messagebox.showerror('Error','The ID which your enter not in the list.')
        return

    tree_idx = df_process.loc[tex, 'ids']  # treeview item id stored for this node
    tree_csv.selection_set(tree_idx)  # highlight the row
    tree_csv.see(tree_idx)  # make the selected row visible

    # Pick up the address of the selected row.
    addr = None
    for item in tree_csv.selection():
        item_text = tree_csv.item(item, "values")
        print(item_text[0])
        addr = item_text[1]

    # Drop whatever was shown before in both result views.
    for x in tree_expand.get_children():
        tree_expand.delete(x)
    for x in tree_parse.get_children():
        tree_parse.delete(x)

    # Nothing selected means nothing to expand or parse.
    if addr is None:
        return

    list_addr = expand_address(addr)

    # NOTE(review): the loop used to reuse ``addr`` as its loop variable, so
    # the string parsed below was the last expansion rather than the selected
    # address. That behaviour is kept but made explicit - confirm the intent.
    parse_target = addr
    for expanded in list_addr:
        print('type of addr: {}'.format(type(expanded)))
        # Wrap the expansion in literal double quotes before inserting it.
        addr_str = '\"' + expanded + '\"'
        tree_expand.insert('', 0, text='Expand', values=addr_str)
        parse_target = expanded

    addr_parse = parse_address(parse_target)
    # Each parsed entry is a pair; its second element becomes the key.
    dict_addr = {pair[1]: pair[0] for pair in addr_parse}
    for item in dict_addr.items():
        tree_parse.insert('', 0, values=list(item))
コード例 #21
0
def addressParser(inputAddress):
    """Split an address string into street and house number.

    The house number is the first ``parse_address`` component whose label
    contains ``house_number``. Its text is looked up case-insensitively in
    the input so the returned value is taken from the input itself, and the
    street is the input with that text removed and surrounding punctuation
    stripped.

    Args:
        inputAddress: Raw address string.

    Returns:
        A JSON string with the keys ``street`` and ``housenumber`` (in that
        order), or ``None`` when any step fails; the error is printed.
    """
    parsedAddress = parse_address(inputAddress)
    print(parsedAddress)
    try:
        # First house_number value, or '' when the parser found none.
        parsedHouseNumber = next(
            (element[0] for element in parsedAddress
             if 'house_number' in element[1]),
            '')

        # Escape the text: it is data, not a pattern, and characters such as
        # '(' or '+' in a house number must be matched literally.
        houseNumber = re.search(
            re.escape(parsedHouseNumber), inputAddress,
            flags=re.IGNORECASE).group(0)

        # Remove the house number from the input, then trim leading and
        # trailing spaces and punctuation and tidy doubled separators.
        street = re.sub(
            re.escape(houseNumber), '',
            inputAddress,
            flags=re.IGNORECASE
        ).rstrip(
            '}{[]()?@$%^*<>/\\\"\'~;:-_, '
        ).lstrip(
            '}{[]()?@$%^*<>/\\\"\'~;:-_,. '
        ).replace(
            '  ', ' '
        ).replace(
            ' ,', ','
        )

        addressDict = OrderedDict([("street", street),
                                   ("housenumber", houseNumber)])
        return json.dumps(addressDict, ensure_ascii=False)
    except Exception as e:
        # Deliberately best-effort: report the problem and return None.
        print(e)
        return None
コード例 #22
0
def getaddress(text):
    """Extract street and house-number information from ``text``.

    A regular expression is tried first; when it matches, the match object
    itself is returned. Otherwise the text is parsed with ``parse_address``
    and a string is built from the road component, followed by a space and
    the house number when that house number consists of digits only and a
    road has already been seen.

    Args:
        text: Free text that may contain an address.

    Returns:
        An ``re.Match`` object when the regular expression matches, else the
        assembled address string, or ``None`` when the parser reports no
        road component.
    """
    match = re.search(
        r'\A(.*?)\s+(\d+[a-zA-Z]{0,1}\s{0,1}[-]{1}\s{0,1}\d*[a-zA-Z]{0,1}|\d+[a-zA-Z-]{0,1}\d*[a-zA-Z]{0,1})', text)
    if match is not None:
        return match

    # Start from None so that a parse without a road component, or one where
    # the house number precedes the road, no longer hits an unbound local.
    UserAddress = None
    for value, label in parse_address(text):
        if label == "road":
            UserAddress = value
        if label == "house_number" and value.isdigit():
            if UserAddress is not None:
                UserAddress = UserAddress + " " + value

    return UserAddress
コード例 #23
0
ファイル: address_parser.py プロジェクト: abrahamy/hello-app
def parse(address: str) -> dict[str, str]:
    """Split ``address`` into street and house number.

    The address is parsed with ``parser.parse_address``; the ``road`` and
    ``house_number`` components are lower-cased and then every word of the
    input (punctuation removed) is substituted back in so the result follows
    the casing of the input.

    Args:
        address: Address string to parse.

    Returns:
        A dict with the keys ``street`` and ``housenumber``.

    Raises:
        ParseError: if ``address`` is not a string, a required component is
            missing, or a ``ValueError`` occurs while parsing.
    """
    try:
        if not isinstance(address, str):
            raise TypeError("`address` is not a string.")

        parts = parser.parse_address(address)
        parts = {k: v.lower() for v, k in parts}
        house_number, street = parts["house_number"], parts["road"]

        # match input case
        words = address.translate(str.maketrans("", "", string.punctuation)).split()
        for word in words:
            lower = word.lower()
            street = street.replace(lower, word)
            house_number = house_number.replace(lower, word)

        return {"street": street, "housenumber": house_number}
    except (KeyError, TypeError, ValueError) as e:
        logger.warning(e)
        # Keep the underlying error attached as the explicit cause.
        raise ParseError(f"Unable to parse address: {address}") from e
コード例 #24
0
ファイル: osm_tagger.py プロジェクト: GRSEB9S/geolabelling
    def label_values(self, values, regions):
        """
        Label the values of a column with OSM names found for their roads.

        The osm names require a set of potential admin_level 8 regions.
        These could be gathered from other columns, metadata,
        or other parts of the strings within the same column.
        :param values: string values, e.g., from a CSV column
        :param regions: set of geonames IDs (not URLs). Only osm names within these regions will be considered
        :return: tuple of (list with one label per value, '' where nothing
            was found; number of labelled entries divided by the number of
            non-blank values, or 0. when there are none)
        """
        place_rows = defaultdict(list)
        road_rows = defaultdict(list)
        non_blank = 0.

        for row, value in enumerate(values):
            if not value.strip():
                continue
            non_blank += 1
            # Remember, per component text, the rows it occurred in.
            for parsed in parse_address(value):
                text, label = parsed[0], parsed[1]
                if label in ROAD:
                    road_rows[text].append(row)
                if label in PLACE:
                    place_rows[text].append(row)

        # TODO: min number of potential roads in column
        labelled_roads = self.find_osm_names(road_rows, regions)

        hits = 0.
        labelled_v = [''] * len(values)
        for road in labelled_roads:
            for row in road_rows[road]:
                labelled_v[row] = labelled_roads[road]
                hits += 1

        if non_blank > 0:
            confidence = hits / non_blank
        else:
            confidence = 0.
        return labelled_v, confidence
コード例 #25
0
 def handle(self, *args, **options):
     """Serve address-parsing requests over sockets until interrupted.

     Binds through ``socket_bind('', 50006)`` and loops on ``select``. New
     connections are accepted and added to the read set. Data received on a
     client socket via ``recv_end`` is parsed with ``parse_address`` and the
     JSON-encoded result is sent back followed by the ``--end--`` marker. A
     client that sends no data, or whose socket raises ``OSError``, is
     closed and removed from the read set.
     """
     self.stdout.write("Loaded location server", ending='\n')
     main_socks, read_socks, write_socks = socket_bind('', 50006)
     while True:
         readable, writeable, exceptions = select(read_socks, write_socks,
                                                  [])
         for sockobj in readable:
             if sockobj in main_socks:
                 new_sock, address = sockobj.accept()
                 print('Connect:', address, id(new_sock))
                 read_socks.append(new_sock)
                 continue

             try:
                 data = recv_end(sockobj)
                 if data:
                     new_data = parse_address(data)
                     sockobj.sendall(
                         json.dumps(new_data).encode('utf8') +
                         '--end--'.encode('utf8'))
             except OSError as exc:
                 # A broken socket would otherwise stay in the read set and
                 # keep failing, so treat it like a closed connection.
                 print('Socket error:', id(sockobj), exc)
                 data = None
             except Exception as exc:
                 # Keep serving on a bad request, but no longer hide the
                 # error or swallow KeyboardInterrupt/SystemExit.
                 print('Request failed:', id(sockobj), exc)
                 continue

             if not data:
                 sockobj.close()
                 read_socks.remove(sockobj)
コード例 #26
0
def get_store_name_from_text(text):
    """Return the first ``house`` component parsed from ``text``.

    Returns:
        The text of the first component labelled ``"house"``, or ``None``
        when the parse contains no such component.
    """
    return next(
        (item for item, kind in parse_address(text) if kind == "house"),
        None,
    )
コード例 #27
0
Uses an existing solution for the demonstration. For real usage, one should
train a model using for example AddressBase data for the best performance.
One should also consider using different labels (tokens).


Requirements
------------

:requires: libpostal (https://github.com/openvenues/libpostal)


Author
------

:author: Sami Niemi ([email protected])


Version
-------

:version: 0.1
:date: 29-Sep-2016
"""
from postal.parser import parse_address

if __name__ == "__main__":
    # Show the parser output for a fixed sample address, then for whatever
    # the user types in.
    sample = '6 PROSPECT GARDENS EXETER EX4 6BA'
    print('Example, parsing {}:'.format(sample))
    print(parse_address(sample))
    var = input('\nInput a string:')
    print(parse_address(var))
コード例 #28
0
ファイル: api.py プロジェクト: manycoding/address-parser
 def post(self, address: str) -> Tuple[Dict, int]:
     """Parse an address string with libpostal without any processing.

     Returns:
         A tuple of the unmodified ``parse_address`` output and the status
         code 200.
     """
     raw_components = parse_address(address)
     return raw_components, 200
def address_parser(row, colname):
    """Parse the ``colname`` entry of ``row`` after converting it to ``str``.

    Returns:
        The ``parse_address`` output for that text.
    """
    cell_text = str(row[colname])
    return parse_address(cell_text)
コード例 #30
0
            matches = datefinder.find_dates(
                string, index=True)  # parsing dates, return date index
            #%% if dates are found in texts, extract dates
            if list(datefinder.find_dates(string, index=True)):
                for match in matches:
                    date = match[0]
                    date_str = date.strftime(
                        '%Y-%m-%d')  #set format for dates (e.g., 2011-01-01)
                    df.loc[i]['date_iso'] = date_str
                    ranking_Y = ranking_m = ranking_d = '1'

                    #%% then use the remaining string to parse addresses
                    letters = string[:match[1][
                        0]]  # extract address from text using date index
                    Parsed_address = parse_address(
                        letters
                    )  # parsing addresses, output example: (('Broadway','Road'),('NYC','City'))
                    parse_address_details(
                        Parsed_address
                    )  #split parsed addresses into road/city/country

            #%% if dates are not found in text, use the whole string to parse address
            else:
                letters = string
                Parsed_address = parse_address(
                    letters
                )  # parsing addresses, output example: (('Broadway','Road'),('NYC','City'))
                parse_address_details(
                    Parsed_address
                )  #split parsed addresses into road/city/country
コード例 #31
0
def main():
  """Regularise the ``offloctn`` column of the moving-violations CSV.

  Reads the input CSV and writes a copy with four extra columns:
  ``parsed_address`` plus three variants with city, state, or county and
  state appended. Each location is lower-cased, stripped of direction
  markers and brackets, and split on "&". Bare interstate, US-highway and
  state-route numbers get an "I-", "US-" or "SR-" prefix. Everything else
  goes through ``parse_address`` with the county, state and country
  appended as context.
  """
  print_with_timestamp("Starting run.")

  inputfile = "../Hennepin County Moving Violations 2010-2015.csv"
  interstates = ["494", "694", "394", "35w", "94"]
  us_highways = ["12", "169", "212"]
  state_routes = ["62", "77", "100", "101", "610"]

  # Bare route number -> prefix. The three lists do not overlap, so one
  # lookup table gives the same answer as checking them one after another.
  route_prefixes = {}
  for prefix, numbers in (("I-", interstates), ("US-", us_highways), ("SR-", state_routes)):
    for number in numbers:
      route_prefixes[number] = prefix

  # Applied in this order: drop direction markers and brackets, then turn
  # every junction separator into "&".
  cleanup = [
    ("n/b", ""), ("e/b", ""), ("s/b", ""), ("w/b", ""),
    ("nb ", ""), ("eb ", ""), ("sb ", ""), ("wb ", ""),
    (" nb", ""), (" eb", ""), (" sb", ""), (" wb", ""),
    ("(", ""), (")", ""),
    ("/", "&"), ("@", "&"), (" from ", "&"), (" at ", "&"), ("&&", "&"),
  ]
  # Context tokens appended for the parser; they are dropped from the output.
  context_tokens = (u'hennepin county', u'minnesota', u'usa')

  outputfile = inputfile[:-4] + "_regularised_addresses.csv"
  # NOTE(review): the 'rU' mode and the ``encoding`` keyword of DictWriter
  # are kept as found; both depend on the Python version and csv module in
  # use, which are not visible here - confirm before changing.
  with open(outputfile, 'w') as f_out, open(inputfile, 'rU') as f_in:
    reader = csv.DictReader(f_in, dialect="excel")
    writer = csv.DictWriter(f_out, reader.fieldnames + ["parsed_address", "parsed_w_city", "parsed_w_state", "parsed_w_county_and_state"], encoding="utf8")
    writer.writeheader()
    n = 0
    for row in reader:
      offloctn = row["offloctn"].lower()
      for old, new in cleanup:
        offloctn = offloctn.replace(old, new)

      addr = ""
      for loc in offloctn.split("&"):
        if addr != "":
          addr = addr + " & "
        route = loc.strip()
        if route in route_prefixes:
          addr += route_prefixes[route] + route
          continue

        newpart = ""
        for token in parse_address(loc + ", Hennepin County, Minnesota, USA"):
          if token[0] not in context_tokens:
            if newpart != "":
              # Suburb and city are set off with a comma, other parts with
              # a space.
              if token[1] in (u'suburb', u'city'):
                newpart += ", "
              else:
                newpart += " "
            newpart += token[0]
        addr = addr + newpart

      # Keep the space that follows the abbreviation so the road number
      # does not run into "County Road".
      addr = addr.replace("cr ", "County Road ").replace("co rd ", "County Road ")
      row["parsed_address"] = addr
      # The ", " separator was missing here, unlike the two columns below.
      row["parsed_w_city"] = addr + ", Minneapolis, Minnesota, USA"
      row["parsed_w_state"] = addr + ", Minnesota, USA"
      row["parsed_w_county_and_state"] = addr + ", Hennepin County, Minnesota, USA"
      writer.writerow(row)
      n = n + 1
      if n % 10000 == 0:
        print_with_timestamp("Wrote " + str(n) + " rows so far.")

  print_with_timestamp("Run complete.")
コード例 #32
0
import csv
from postal.parser import parse_address

file = "../data/ParkverstoesseBonn2017OpenData_raw.csv"

# Number of rows read so far; stays 0 when the file has no rows.
i = 0

with open(file, encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=';', quotechar='"')

    # Print the cleaned third column and its parse for at most 1000 rows.
    for i, row in enumerate(reader, start=1):
        address = row[2]
        address = address.replace("Bonn, ", "")
        address = address.replace("Bonn , ", "")
        address = address.replace("-", " ")
        address = address.replace("gegenüber Hnr", "")
        print(address)
        print(parse_address(address))
        if i == 1000:
            break