Example #1
0
def trainIncoming(name):
    """Interactively train the address gazetteer on a table's unmatched rows.

    Selects every row of table ``name`` whose ``address_id`` is NULL and
    whose ``complete_address`` is not NULL, hands those addresses to the
    gazetteer as sample data, loads any previously saved labels, runs the
    console labelling session, trains, and then writes the labelled
    examples and the learned settings back to disk.

    The training and settings paths are relative, so the function must be
    run from the directory that contains the ``geocoder`` package.

    Args:
        name: Name of the database table holding the incoming addresses.

    Raises:
        SystemExit: If ``checkForTable`` returns ``None`` for ``name``.
    """
    from geocoder.deduper import DatabaseGazetteer
    import dedupe

    training_path = 'geocoder/data/training.json'
    settings_path = 'geocoder/dedupe.settings'

    engine = create_engine(DB_CONN)

    deduper = DatabaseGazetteer([{'field': 'complete_address', 'type': 'Address'}],
                                engine=engine)

    sql_table = checkForTable(engine, name)

    if sql_table is None:
        # Exit with a message (and a non-zero status) instead of silently.
        sys.exit('Cannot train: table "{0}" was not found'.format(name))

    primary_key = sql_table.primary_key.columns.keys()[0]

    # Identifiers cannot be passed as bound parameters, so the table and key
    # names are formatted in.  NOTE(review): this relies on checkForTable
    # having accepted ``name`` -- confirm it rejects untrusted input.
    messy_table = ''' 
        SELECT {0}, complete_address
        FROM {1}
        WHERE address_id IS NULL
          AND complete_address IS NOT NULL
    '''.format(primary_key, name)

    curs = engine.execute(messy_table)
    try:
        # Stream rows lazily; only the address text is passed on.
        messy_data = ({'complete_address': r.complete_address} for r in curs)
        deduper.drawSample(messy_data, sample_size=30000)
    finally:
        # Release the cursor even if sampling fails or stops early.
        curs.close()

    if os.path.exists(training_path):
        print('reading labeled examples from {0}'.format(training_path))
        with open(training_path) as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)

    deduper.train(ppc=0.1, index_predicates=False)

    # When finished, save our training away to disk
    with open(training_path, 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk so they can be reloaded
    # later without repeating the labelling session.
    with open(settings_path, 'wb') as sf:
        deduper.writeSettings(sf)

    deduper.cleanupTraining()
Example #2
0
        #
        # suburbs.mergeTables()

        connection.close()

    if args.train:
        from geocoder.deduper import DatabaseGazetteer
        import simplejson as json
        import dedupe

        from geocoder.app_config import DB_CONN
        engine = create_engine(DB_CONN)

        deduper = DatabaseGazetteer([{
            'field': 'complete_address',
            'type': 'Address'
        }],
                                    engine=engine)

        messy_data = json.load(open('geocoder/data/messy_addresses.json'))
        deduper.drawSample(messy_data, sample_size=30000)

        if os.path.exists('geocoder/data/training.json'):
            print('reading labeled examples from geocoder/data/training.json')
            with open('geocoder/data/training.json') as tf:
                deduper.readTraining(tf)

        dedupe.consoleLabel(deduper)

        deduper.train(ppc=0.1, index_predicates=False)
Example #3
0
        # 
        # suburbs.run(download_url=download_url)
        # 
        # suburbs.mergeTables()

        connection.close()

    if args.train:
        from geocoder.deduper import DatabaseGazetteer
        import simplejson as json
        import dedupe

        from geocoder.app_config import DB_CONN
        engine = create_engine(DB_CONN) 
        
        deduper = DatabaseGazetteer([{'field': 'complete_address', 'type': 'Address'}],
                                    engine=engine)

        messy_data = json.load(open('geocoder/data/messy_addresses.json'))
        deduper.drawSample(messy_data, sample_size=30000)
        
        if os.path.exists('geocoder/data/training.json'):
            print('reading labeled examples from geocoder/data/training.json')
            with open('geocoder/data/training.json') as tf :
                deduper.readTraining(tf)
        
        dedupe.consoleLabel(deduper)

        deduper.train(ppc=0.1, index_predicates=False)
        
        # When finished, save our training away to disk
        with open('geocoder/data/training.json', 'w') as tf :
Example #4
0
def trainIncoming(name):
    """Interactively train the address gazetteer on a table's unmatched rows.

    Selects every row of table ``name`` whose ``address_id`` is NULL and
    whose ``complete_address`` is not NULL, hands those addresses to the
    gazetteer as sample data, loads any previously saved labels, runs the
    console labelling session, trains, and then writes the labelled
    examples and the learned settings back to disk.

    The training and settings paths are relative, so the function must be
    run from the directory that contains the ``geocoder`` package.

    Args:
        name: Name of the database table holding the incoming addresses.

    Raises:
        SystemExit: If ``checkForTable`` returns ``None`` for ``name``.
    """
    from geocoder.deduper import DatabaseGazetteer
    import dedupe

    training_path = 'geocoder/data/training.json'
    settings_path = 'geocoder/dedupe.settings'

    engine = create_engine(DB_CONN)

    deduper = DatabaseGazetteer(
        [{'field': 'complete_address', 'type': 'Address'}],
        engine=engine)

    sql_table = checkForTable(engine, name)

    if sql_table is None:
        # Exit with a message (and a non-zero status) instead of silently.
        sys.exit('Cannot train: table "{0}" was not found'.format(name))

    primary_key = sql_table.primary_key.columns.keys()[0]

    # Identifiers cannot be passed as bound parameters, so the table and key
    # names are formatted in.  NOTE(review): this relies on checkForTable
    # having accepted ``name`` -- confirm it rejects untrusted input.
    messy_table = ''' 
        SELECT {0}, complete_address
        FROM {1}
        WHERE address_id IS NULL
          AND complete_address IS NOT NULL
    '''.format(primary_key, name)

    curs = engine.execute(messy_table)
    try:
        # Stream rows lazily; only the address text is passed on.
        messy_data = ({'complete_address': r.complete_address} for r in curs)
        deduper.drawSample(messy_data, sample_size=30000)
    finally:
        # Release the cursor even if sampling fails or stops early.
        curs.close()

    if os.path.exists(training_path):
        print('reading labeled examples from {0}'.format(training_path))
        with open(training_path) as tf:
            deduper.readTraining(tf)

    dedupe.consoleLabel(deduper)

    deduper.train(ppc=0.1, index_predicates=False)

    # When finished, save our training away to disk
    with open(training_path, 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk so they can be reloaded
    # later without repeating the labelling session.
    with open(settings_path, 'wb') as sf:
        deduper.writeSettings(sf)

    deduper.cleanupTraining()