busqueda = Busqueda.objects.get(id=int(busqueda_in))
    except:
        logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.')
        exit(-1)
else:
    try:
        busqueda = Busqueda.objects.get(nombre=busqueda_in)
    except:
        logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.')
        exit(-1)
        
# Announce which Busqueda record the preceding lookup resolved.
logging.info('Processing busqueda ' +busqueda.nombre )

#TODO: move all of this into a class inside unoporuno/modules/dospordos/features.py
# Module-level feature extractors. Each one is built from UNOPORUNO_ROOT and
# a resource name; the names suggest regex-based vs. gazetteer (word-list)
# matching, but the classes are defined outside this section.
# NOTE(review): the trailing boolean on the gazetteer features presumably
# toggles case-sensitive matching (see the *Case variants) -- confirm in the
# features module.
OrganizationRegex = RegexFeature(UNOPORUNO_ROOT, 'organization')
CountryGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',False)
CountryGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',True)
CityGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', False)
CityGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', True)
AccronymGazet = GazetteerFeature(UNOPORUNO_ROOT, 'accronym', True)
BiophrasesRegex = RegexFeature(UNOPORUNO_ROOT, 'biographical phrases')
# Profession and degree each have both a regex and a gazetteer extractor.
ProfessionRegex = RegexFeature(UNOPORUNO_ROOT, 'profession')
ProfessionGazt = GazetteerFeature(UNOPORUNO_ROOT, 'profession')
DegreeRegex = RegexFeature(UNOPORUNO_ROOT, 'degree')
DegreeGazt = GazetteerFeature(UNOPORUNO_ROOT, 'degree')
CvRegex = RegexFeature(UNOPORUNO_ROOT, 'cv general')
CvHttpRegex = RegexFeature(UNOPORUNO_ROOT, 'cv http')
# Nationalities: 'latin nationalities' and 'world nationalities es' use
# regexes, while 'world nationalities en' uses a gazetteer.
LatinNatRegex = RegexFeature(UNOPORUNO_ROOT, 'latin nationalities')
WorldNatRegex = RegexFeature(UNOPORUNO_ROOT, 'world nationalities es')
WorldNatGazt = GazetteerFeature(UNOPORUNO_ROOT, 'world nationalities en')
EmailRegex = RegexFeature(UNOPORUNO_ROOT, 'email')
else:
    try:
        busqueda = Busqueda.objects.get(nombre=busqueda_in)
    except:
        logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.')
        exit(-1)

        
# Announce which Busqueda record the preceding lookup resolved.
logging.info('Processing busqueda ' +busqueda.nombre )
# Optional second command-line argument; falls back to 'all' when absent.
# Only a missing argument (IndexError) is handled, so unrelated failures and
# SystemExit/KeyboardInterrupt are no longer swallowed by a bare except.
try:
    selection = sys.argv[2]
except IndexError:
    selection = 'all'
#TODO: move all of this into a class inside unoporuno/modules/dospordos/features.py
# Module-level feature extractors. Each one is built from UNOPORUNO_ROOT and
# a resource name; the classes themselves are defined outside this section.
# NOTE(review): the trailing boolean on the gazetteer features presumably
# toggles case-sensitive matching (see the *Case variants) -- confirm in the
# features module.
OrganizationRegex = RegexFeature(UNOPORUNO_ROOT, 'organization')
CountryGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',False)
CountryGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',True)
CityGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', False)
CityGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', True)
AccronymGazet = GazetteerFeature(UNOPORUNO_ROOT, 'accronym', True)
BiophrasesRegex = RegexFeature(UNOPORUNO_ROOT, 'biographical phrases')
# Profession and degree each have both a regex and a gazetteer extractor.
ProfessionRegex = RegexFeature(UNOPORUNO_ROOT, 'profession')
ProfessionGazt = GazetteerFeature(UNOPORUNO_ROOT, 'profession')
DegreeRegex = RegexFeature(UNOPORUNO_ROOT, 'degree')
DegreeGazt = GazetteerFeature(UNOPORUNO_ROOT, 'degree')
CvRegex = RegexFeature(UNOPORUNO_ROOT, 'cv general')
CvHttpRegex = RegexFeature(UNOPORUNO_ROOT, 'cv http')
LatinNatRegex = RegexFeature(UNOPORUNO_ROOT, 'latin nationalities')
WorldNatRegex = RegexFeature(UNOPORUNO_ROOT, 'world nationalities es')
WorldNatGazt = GazetteerFeature(UNOPORUNO_ROOT, 'world nationalities en')
EmailRegex = RegexFeature(UNOPORUNO_ROOT, 'email')
# Snippet counts at which a "Top N" Vinculo summary is stored for a person.
# The last entry is also the maximum number of snippets processed per person.
_TOP_CHECKPOINTS = (5, 10, 15, 20)


def _resolve_busqueda(busqueda_in):
    """Return the Busqueda selected on the command line, or exit with -1.

    An all-digit argument is looked up by ``id``; anything else by ``nombre``.
    """
    if busqueda_in.isdigit():
        field, lookup = 'id', {'id': int(busqueda_in)}
    else:
        field, lookup = 'nombre', {'nombre': busqueda_in}
    try:
        return Busqueda.objects.get(**lookup)
    except Exception:
        # ``Exception`` rather than a bare except, so SystemExit and
        # KeyboardInterrupt are not swallowed. The message names the field
        # that was actually queried.
        logging.error('No busqueda object with ' + field + '=' + busqueda_in + ' in UNOPORUNO database.')
        sys.exit(-1)


def _scan_snippet(snippet, extractors):
    """Return all matches the extractors find in a snippet's title and description.

    ``extractors`` is a sequence of ``(feature, label, title_tag, descr_tag)``
    tuples. Each feature's ``list_test`` is run on the title and then on the
    description, in the order given; label and tags are only used to build
    the debug log line.
    """
    # list_test receives the UTF-8 encoded text, while the log line shows the
    # original (unencoded) title/description.
    title_test_str = snippet.title.encode('utf-8')
    descr_test_str = snippet.description.encode('utf-8')
    found = []
    for feature, label, title_tag, descr_tag in extractors:
        for encoded, original, tag in ((title_test_str, snippet.title, title_tag),
                                       (descr_test_str, snippet.description, descr_tag)):
            matches = feature.list_test(encoded)
            if matches:
                logging.debug(label + str(matches) + tag + original)
                found += matches
    return found


def _format_frequencies(pairs):
    """Render ``(name, count)`` pairs as one ``name (count)`` line per pair."""
    return ''.join(name + ' (' + str(count) + ')\n' for name, count in pairs)


def _save_vinculo(persona, organizations, countries, top_n):
    """Store a Vinculo summarising the matches gathered from the first ``top_n`` snippets.

    Both match lists are passed through ``construye_dict_freq`` (defined
    elsewhere in this file) and its items are sorted by descending count.
    """
    sorted_list_org = sorted(construye_dict_freq(organizations).items(), key=lambda t: -t[1])
    sorted_list_loc = sorted(construye_dict_freq(countries).items(), key=lambda t: -t[1])

    logging.info('VÍNCULOS ORGANIZACIONES:: '+str(sorted_list_org))
    logging.info('VÍNCULOS LUGARES:: '+str(sorted_list_loc))
    vinculo = Vinculo()
    vinculo.persona = persona
    vinculo.organizaciones = _format_frequencies(sorted_list_org)
    vinculo.lugares = _format_frequencies(sorted_list_loc)
    vinculo.descripcion = 'Top ' + str(top_n)
    vinculo.tipo = top_n
    vinculo.save()


def main():
    """Rebuild the organization/location links (Vinculo rows) of one Busqueda.

    Reads the Busqueda id or nombre from ``sys.argv[1]`` and exits with
    status -1 when the argument is missing or the lookup fails. For every
    person of the Busqueda, the existing vinculos are deleted, then up to 20
    snippets are scanned for organizations, acronyms, countries and cities:
    first those with ``converging_pipelines=1``, then the rest ordered by
    descending ``RE_score``. A cumulative "Top N" Vinculo is saved after the
    5th, 10th, 15th and 20th processed snippet.
    """
    try:
        busqueda_in = sys.argv[1]
    except IndexError:
        logging.error('No parameter busqueda')
        logging.error('Usage: python batch_biographic_filter.py NAME|NUMBER path')
        sys.exit(-1)
    busqueda = _resolve_busqueda(busqueda_in)

    logging.info('Processing busqueda ' +busqueda.nombre )

    #TODO: move all of this into a class inside unoporuno/modules/dospordos/features.py
    organization_regex = RegexFeature(UNOPORUNO_ROOT, 'organization')
    country_gazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country', False)
    country_gazet_case = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country', True)
    city_gazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', False)
    city_gazet_case = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', True)
    accronym_gazet = GazetteerFeature(UNOPORUNO_ROOT, 'accronym', True)

    # (feature, log label, title tag, description tag), in evaluation order.
    organization_extractors = (
        (organization_regex, 'FOUND ORGANIZATIONS ', ' IN TITLE:\n ', ' IN DESCR:\n '),
        (accronym_gazet, 'FOUND ORG.ACCRONYMS ', ' IN TITLE:\n ', ' IN DESCR:\n '),
    )
    # City gazetteers yield countries too, so their matches count as locations.
    location_extractors = (
        (country_gazet, 'FOUND COUNTRIES ', ' IN TITLE:\n ', ' IN DESCR:\n '),
        (country_gazet_case, 'FOUND COUNTRIES ', ' IN TITLE:\n ', ' IN DESCR:\n '),
        (city_gazet, 'FOUND CITIES FROM COUNTRIES ', ' IN CI TITLE\n', ' IN CI DESCRIPTION\n'),
        (city_gazet_case, 'FOUND CITIES FROM COUNTRIES ', ' IN CD TITLE\n', ' IN CD DESCRIPTION\n'),
    )
    snippet_limit = _TOP_CHECKPOINTS[-1]

    for p in busqueda.persona_set.all():
        logging.info("processing person " +p.name)
        person_countries = []
        person_organizations = []
        top_snippets_count = 0
        # Links are rebuilt from scratch for every person.
        p.vinculo_set.all().delete()

        candidates = p.snippet_set.filter(FG=1).exclude(RE=1)
        # Converging-pipeline snippets take priority; the remaining ones only
        # fill what is left of the quota, best RE_score first. Querysets are
        # lazy, so the second one is not evaluated unless it is iterated.
        snippet_batches = (
            candidates.filter(converging_pipelines=1),
            candidates.exclude(converging_pipelines=1).order_by('-RE_score'),
        )
        for batch in snippet_batches:
            if top_snippets_count >= snippet_limit:
                break
            for s in batch:
                # FG is already constrained by the query; kept as a guard.
                if s.FG==0 or s.RE_features<1:
                    continue
                #todo: filter the snippets according to the features we look for
                person_organizations += _scan_snippet(s, organization_extractors)
                person_countries += _scan_snippet(s, location_extractors)
                top_snippets_count += 1
                if top_snippets_count in _TOP_CHECKPOINTS:
                    _save_vinculo(p, person_organizations, person_countries, top_snippets_count)
                # Nothing is stored past the last checkpoint, so stop scanning.
                if top_snippets_count >= snippet_limit:
                    break