Example #1
File: uflix.py Project: sg-s/uflix
	def resolve_name_from_imdb(self, name):
		'''resolves name from list of names using
		fuzzy string matching'''

		this_letter = name[0]
		a = bisect_left(self.imdb_movies,this_letter)
		next_letter = chr(ord(this_letter)+1)
		z = bisect_left(self.imdb_movies,next_letter)

		if a != 0 and z != 0 and z > a:
			imdb_name, score = process.extractOne(name,self.imdb_movies[a:z])

			# figure out the year too
			idx = self.imdb_movies.index(imdb_name)
			year = self.imdb_movies_year[idx]

			if score == 100:
				return (imdb_name, score, year)

		print("Could not get an exact match, will perform a full search...")

		imdb_name, score = process.extractOne(name,self.imdb_movies,scorer=fuzz.token_sort_ratio)

		# figure out the year too
		idx = self.imdb_movies.index(imdb_name)
		year = self.imdb_movies_year[idx]

		return (imdb_name, score, year)
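A note on the pattern above: because self.imdb_movies is kept sorted, bisect_left carves out the slice of titles that share the query's first letter, so the expensive extractOne call only scans that window before falling back to a full search. A minimal standalone sketch of the same idea (the helper name and sample titles are made up):

from bisect import bisect_left
from fuzzywuzzy import process

titles = sorted(["Alien", "Aliens", "Amadeus", "Blade Runner", "Brazil"])

def prefix_window_match(name, sorted_choices):
    # slice out the titles that start with the same letter as the query
    lo = bisect_left(sorted_choices, name[0])
    hi = bisect_left(sorted_choices, chr(ord(name[0]) + 1))
    if hi > lo:
        return process.extractOne(name, sorted_choices[lo:hi])
    # no usable window -- scan everything
    return process.extractOne(name, sorted_choices)

print(prefix_window_match("Amadus", titles))  # -> ('Amadeus', <score>)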
Example #2
    def testWithScorer(self):
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox",
        ]

        choices_dict = {
            1: "new york mets vs chicago cubs",
            2: "chicago cubs vs chicago white sox",
            3: "philladelphia phillies vs atlanta braves",
            4: "braves vs mets",
        }

        # in this hypothetical example we care about ordering, so we use quick ratio
        query = "new york mets at chicago cubs"
        scorer = fuzz.QRatio

        # first, as an example, the normal way would select the "more
        # 'complete' match of choices[1]"

        best = process.extractOne(query, choices)
        self.assertEqual(best[0], choices[1])

        # now, use the custom scorer

        best = process.extractOne(query, choices, scorer=scorer)
        self.assertEqual(best[0], choices[0])

        best = process.extractOne(query, choices_dict)
        self.assertEqual(best[0], choices_dict[1])
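The test above is really about scorer choice: the default scorer and fuzz.QRatio disagree on which choice is "best". A compact illustration (exact scores vary a little between fuzzywuzzy versions):

from fuzzywuzzy import fuzz, process

query = "new york mets at chicago cubs"
choices = ["new york mets vs chicago cubs", "chicago cubs at new york mets"]

# default scorer (fuzz.WRatio) favours the choice whose token set matches exactly,
# even though the word order is different
print(process.extractOne(query, choices))

# fuzz.QRatio is order-sensitive, so the choice with the same word order wins
print(process.extractOne(query, choices, scorer=fuzz.QRatio))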
Example #3
    def map_all(self):
        '''
        map_all has a confusing array of variables:
        - "mapkeys" are the columns of a mapping file. These are boilerplate -- 'field name', 'source code', 'source value', etc.
        - "mapdict_of_element" is the "mapdict" attribute of each Element object. So for ihr.race.mapdict, ihr is the registry, race is the element, and mapdict is the dictionary mapping valueset values to their targets.
        - "mapdict_of_element_keys" is a convenience list that contains the KEYS of mapdict_of_element sans all nan values. nan values trip up the fuzzy matching algorithm (extractOne), and it is definitely more valuable to have that algorithm work.
        '''
        for x in self.mapmaster:
            if x[closest_match('field_name', self.mapkeys)] in self.regobject.elements:
                
                mapdict_of_element= getattr(getattr(self.regobject, x['field name']), 'mapdict')
                mapdict_of_element_keys = [x for x in mapdict_of_element.keys() if str(x) != 'nan']
                print(mapdict_of_element)
                self.mapmaster[0][closest_match('yes', self.mapkeys)]

                code = x[closest_match('source_code', self.mapkeys)]
                value = x[closest_match('source_value', self.mapkeys)]
                try:
                    if process.extractOne(str(code),  mapdict_of_element_keys)[1] > 50:
                        try:
                            mapdict_of_element[code] = x[closest_match('omop_concept_id', self.mapkeys)]
                        except: handle_it()
                    else:
                        if process.extractOne(str(value),  mapdict_of_element_keys)[1] > 50:
                            try:
                                mapdict_of_element[value] = x[closest_match('omop_concept_id', self.mapkeys)]
                            except: handle_it()
                        print(str(x['field name']) + ", " + str(code) + " cannot be mapped")
                except:
                    handle_it()
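As the docstring above explains, NaN keys are dropped before calling extractOne because non-string entries trip up the matcher. The filtering step in isolation (dictionary contents invented):

from fuzzywuzzy import process

# a value-set mapping with a stray NaN key, as read from a spreadsheet
mapdict = {"White": None, "Black or African American": None, float("nan"): None}

# keep only keys that are real strings before fuzzy matching
clean_keys = [k for k in mapdict if str(k) != 'nan']

best_key, score = process.extractOne("white", clean_keys)
if score > 50:  # the same loose threshold map_all uses
    mapdict[best_key] = "target_code"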
Example #4
def play_item(item, item_type=None):
    print('play_item')
    conf = settings.Config.get_config()
    if item_type == 'muzyka':
        albums, names = m3uparser.parseFolderForPlaylists(
            conf['media_dir'] + '/Music')
        load_best_playlist(albums, names, item)
    elif item_type == 'audiobook':
        albums, names = m3uparser.parseFolderForPlaylists(
            conf['media_dir'] + '/Audiobooks')
        load_best_playlist(albums, names, item)
    elif item_type == 'podcast':
        albums, names = m3uparser.parseFolderForPlaylists(
            conf['media_dir'] + '/Podcasts')
        load_best_playlist(albums, names, item)
    elif item_type == 'radio':
        tracks, titles = m3uparser.parseFolderForTracks(
            conf['media_dir'] + '/Radio')
        load_best_track(tracks, titles, item)
    else:
        # try to play without a type
        print(conf['media_dir'])
        tracks, titles = m3uparser.parseFolderForTracks(
            conf['media_dir'])
        albums, names = m3uparser.parseFolderForPlaylists(
            conf['media_dir'])
        title = process.extractOne(item, titles)
        print(str(title))
        name = process.extractOne(item, names)
        print(str(name))
        if title[1] > name[1]:
            load_best_track(tracks, titles, item)
        else:
            load_best_playlist(albums, names, item)
Example #5
  def decodeName(self, card):
    if card['set'] == 'Full Sets':
      return None
    name = card['desc']
    if card['set'] == 'Promos':
      if name.find('Baltimare') != -1 or name.find('SDCC') != -1:
        return None
      if name.find('Pre-Release') != -1:
        return None
      if name.startswith('Lady Justice Volunteer Promo'):
        return {'name': 'Lady Justice, Judge & Jury', 'id': 'pf16PR'}
    set = card['set']

    # Try to find the card ID. Should always be the last number.
    id = None
    try:
      id = re.findall('[FPfp]?[^ \t\n#0-9]?[0-9]{1,3}', name)[-1]
      name = name.replace(id, '')
      name = name.strip()
      if name[-1] == '-':
        name = name[:-1]
        name = name.strip()
    except:
      pass
    score = -1
    # Find the closest matching name. Things may be misspelled ;/
    fullname = None
    if set not in self.namesBySet:
      extract = process.extractOne(name, self.allNames)
      score = extract[1]
    else:
      extract = process.extractOne(name, self.namesBySet[set])
      score = extract[1]
    fullname = extract[0]
    return {'name': fullname, 'id': id}
Example #6
def preprocess_nonurban_text(text, intersections, threshold=80):
    text_new = text
    if 'צומת' in text:  # 'tsomet' = junction
        text_new = text.split('צומת')[1].strip()
        suspected_intersection = process.extractOne(text_new, intersections.intersection, scorer=fuzz.token_set_ratio,
                                                    score_cutoff=threshold)
        if suspected_intersection is None:
            text_new = text
    elif 'מחלף' in text:  # 'mechlef' = interchange
        text_new = text.split('מחלף')[1].strip()
        suspected_intersection = process.extractOne(text_new, intersections.intersection, scorer=fuzz.token_set_ratio,
                                                    score_cutoff=threshold)
        if suspected_intersection is None:
            text_new = text
    elif 'כניסה ל' in text:  # 'knisa le' = entrance to
        text_new = text.split('כניסה ל')[1].strip()
        suspected_intersection = process.extractOne(text_new, intersections.intersection, scorer=fuzz.token_set_ratio,
                                                    score_cutoff=threshold)
        if suspected_intersection is None:
            text_new = text
    elif 'כביש' in text:  # 'kvish' = road
        text_new = text.split('כביש')[1].strip()
        suspected_intersection = process.extractOne(text_new, intersections.intersection, scorer=fuzz.token_set_ratio,
                                                    score_cutoff=threshold)
        if suspected_intersection is None:
            text_new = text
    return text_new
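The None checks above depend on a specific contract: when score_cutoff is supplied and no choice reaches it, extractOne returns None rather than a (match, score) tuple, and the caller then falls back to the original text. A tiny sketch (the intersection names are invented):

from fuzzywuzzy import fuzz, process

intersections = ["Golani Junction", "Yokneam Interchange"]

hit = process.extractOne("Golani", intersections,
                         scorer=fuzz.token_set_ratio, score_cutoff=80)
miss = process.extractOne("completely unrelated text", intersections,
                          scorer=fuzz.token_set_ratio, score_cutoff=80)

print(hit)   # a (match, score) tuple
print(miss)  # None, so the caller keeps the original text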
Example #7
def mapTitle(title, pub_year):
    
    mtitles = []
    with open('/Users/petertamisin/demo/marvel_series_list.csv', 'rb') as csvfile:
        reader = csv.DictReader( csvfile )
        for line in reader:
            if (int(pub_year) >= int(line[ 'startyear' ]) and int(pub_year) <= int(line[ 'endyear' ])):        
                debug(4,line)
                # Remove Series Dates from title
                if (line['title'].endswith(')') and line['title'].rfind('(')>0) :
                    idx = line['title'].rfind('(')    
                    line['newtitle'] = line['title'][0:idx]
                    line['newtitle'] = line['newtitle'].rstrip() 
                debug(4, line['newtitle'])
                mtitles.append(line)

    match = (process.extractOne(title, sorted(set(d['newtitle'] for d in mtitles), reverse=True)))
    score = match[1]
    if score >= 90:
        for mline in mtitles:
            if(mline["newtitle"] == match[0]):
                debug(4,mline)
                return mline
    else:
        #try match again with translated title
        match = (process.extractOne(wordSwap(title), sorted(set(d['newtitle'] for d in mtitles), reverse=True)))
        score = match[1]
        if score >= 90:
            for mline in mtitles:
                if(mline["newtitle"] == match[0]):
                    debug(4,mline)
                    return mline
        
        debug(2, 'MISMATCH:' + ''.join(str(e) for e in match) + '~' + wordSwap(title))
        return 
Example #8
def predict_income(city, job, age, education, gender):

	city_names = city_name_map.keys()
	job_names = job_title_map.keys()
	
	standardized_city_name = process.extractOne(city, city_names)[0]
	standardized_job_name = process.extractOne(job, job_names)[0]

	given = [city_name_map[standardized_city_name], job_title_map[standardized_job_name], education]

	inputs = []
	for field in fields:
		if field in given:
			print field
			inputs.append(1)
		else:
			inputs.append(0)

	age = float(age)
	age_std = (age - 16) / (85 - 16)
	inputs[fields.index('age')] = age_std

	inputs[fields.index('female')] = gender_map[gender]

	return np.dot(inputs, coefficients) * 1.5
Example #9
def normalize_country(country):
    if process.extractBests(country, choices=['N/A', 'n/a'], score_cutoff=HIGH_CUTOFF):
        return None
    if country == 'null':
        return None
    if country in ['USA', 'UK']:
        return country
    if process.extractOne(country, choices=['United Kingdom', 'UK'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['England', 'Wales', 'Scotland'], score_cutoff=HIGH_CUTOFF):
        return 'UK'
    if process.extractOne(country, choices=['Russia', 'Russian Federation'], score_cutoff=LOW_CUTOFF):
        return 'Russia'
    try:
        country = pycountry.historic_countries.get(name=country.capitalize())
        return country.name
    except KeyError: pass
    
    try:
        country = pycountry.historic_countries.get(alpha2=country.upper())
        return country.name
    except KeyError: pass

    countries = [country.name for country in pycountry.countries]
    best_match = process.extractOne(country, choices=countries, score_cutoff=90)
    return best_match if best_match else country 
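normalize_country leans on two related behaviours: extractBests returns a (possibly empty) list of (match, score) pairs at or above score_cutoff, and extractOne returns None below the cutoff, so both results work directly as boolean tests. For example (the cutoff value here is only illustrative; HIGH_CUTOFF is defined elsewhere in the module):

from fuzzywuzzy import process

HIGH_CUTOFF = 90  # illustrative stand-in for the module-level constant

print(process.extractBests("n/a", ["N/A", "n/a"], score_cutoff=HIGH_CUTOFF))
# non-empty list (both choices score 100 after preprocessing), so treat the value as missing

print(process.extractBests("Germany", ["N/A", "n/a"], score_cutoff=HIGH_CUTOFF))
# empty list, which is falsy, so normalisation continues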
Example #10
def guess_column_names(columnname):
    ''' An attempt to standardize and rename column headers for manipulation later.
        Input: column name (String)
        Output: Corrected name (String)
    '''    
    #list of possible "correct" column headers 
    correct_headers = ['First Name', 'Last Name','Fullname','Student Name','Job Title', 'Title','ID',
                       'Institution','School','Company','Company Name1','Company Name2','Organization Name','Department','Division',
                       'Email Address','Street Address','Street 1','Dorm Address 1','Dorm Address 2','Dorm Address 3',
                       'Dorm Address 4','Address 1','Street 2','Address 2','Address','Street 3','Address 3','Street 4','Address 4',
                       'Work Street 1','Work Street 2','Work Street 2','Work Street 3','Work Street 4',
                       'Zipcode','Home Zipcode','Work_City','Dorm Postalplus4','HOME_FOREIGN_CITYZIP','WORK_FOREIGN_CITYZIP','Work_State','Work_Country',
                       'Postal','City','County','State','Country']
    
    # if column is exact match return name
    if columnname in correct_headers:  # might want to make this a dict for O(1) lookups
        return columnname#, 100
    
    # if column name is longer than 20 characters, return best guess based on last 15 characters
    if len(columnname) > 20:
        new_name, score = process.extractOne(columnname[-15:], correct_headers) 
        return new_name#, score
    
    # for all others, 
    else:
        new_name, score = process.extractOne(columnname, correct_headers)
        
    #if score > 80, return new_name 
    if score < 80:
        # returns original name if match is bad
        return columnname#, score
    else:
        return new_name#, score
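Hypothetical calls, to make the three branches above concrete (the input headers are invented):

print(guess_column_names('City'))           # exact match, returned unchanged
print(guess_column_names('email_address'))  # fuzzy-matched to 'Email Address'
print(guess_column_names('xyzzy'))          # likely below the 80 cutoff, so the original name is kept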
Example #11
def fuzzThis(countries, choices):

		from fuzzywuzzy import fuzz
		from fuzzywuzzy import process

		for country in countries:
			print country
			print process.extractOne(country, choices)
Example #12
def find_string(string):
	with open('ratio.txt') as f:
		lines = f.read().splitlines()
	ratio = process.extractOne(string, lines)[0]
	with open('companies.txt') as f:
		lines = f.read().splitlines()
	company = process.extractOne(string, lines)[0]
	return ratio + " " + company
Example #13
    def match_move(self, char, move, vt, data):
        '''
        Main helper function that handles matching the move.
        Uses the reverse mapping of the common name, input command
        and short form converter to increase the chances of a better
        match.
        '''
        # First find the char they want.
        char_match, char_ratio = process.extractOne(char,
                                                    data.keys())
        if char_ratio < self.char_ratio_thresh:
            return False

        # They might have supplied the move name in shortened format
        # so convert it to how the frame data dump expects.
        result = re.search(self.short_regex, move)
        if result:
            matched = result.group(0)
            # Slice to the second last char because the matched move might
            # be 'cr. ' or 'cr ' but the mapping only contains 'cr'.
            move = re.sub(
                self.short_regex, self.short_mapping[matched[:-1]], move
            )

        # Use the reverse mapping to determine which move they
        # were looking for.
        moves = data[char_match]['reverse_mapping']
        move_match, move_ratio = process.extractOne(move, moves.keys())

        if move_ratio < self.move_ratio_thresh:
            return False

        move = data[char_match]['reverse_mapping'][move_match]

        # Check if the matched name was a char stat or a move.
        if 'char_stat' in move:
            return char_match, move_match, move
        else:
            # Find the move they want.
            if vt:
                # The move might not have any difference in vtrigger
                # so just return the normal version.
                try:
                    move_data = data[char_match]['moves'][self.vt_mappings[vt]][move]
                except KeyError:
                    move_data = data[char_match]['moves']['normal'][move]
            else:
                try:
                    move_data = data[char_match]['moves']['normal'][move]
                # Might be a vtrigger only move.
                except KeyError:
                    try:
                        move_data = data[char_match]['moves']['vtOne'][move]
                    except KeyError:
                        move_data = data[char_match]['moves']['vtTwo'][move]

            return char_match, move, move_data
Example #14
    def test_simplematch(self):
        basic_string = 'a, b'
        match_strings = ['a, b']

        result = process.extractOne(basic_string, match_strings, scorer=fuzz.ratio)
        part_result = process.extractOne(basic_string, match_strings, scorer=fuzz.partial_ratio)

        self.assertEqual(result, ('a, b', 100))
        self.assertEqual(part_result, ('a, b', 100))
Example #15
def fuzzy_match(inputlist, choices):
    if (isinstance(inputlist, str)):
        fz = process.extractOne(inputlist, choices)
        return(fz[0])
    else:
        outlist = []
        for s in inputlist:
            fz = process.extractOne(s, choices)
            outlist.append(fz[0])
        return(outlist)
Example #16
def mapProjectsToLDAP(project_list, project_type, tenant_list=False):
    """Create a payload for ldap_updater module calls.

    Generate a list of dictionaries mapping Insightly properties to LDAP attributes.

    Args:
        project_list (List): A list of projects as JSON from Insightly to be converted into LDAP-like dictionaries.
        project_type (List): A description of the type of project, one of 'SDA', 'FPA' or 'FPA (CRA)'.
        tenant_list (List, optional): A list of tenants as JSON from Insightly,
            i.e. projects on the 'OpenStack Tenant' category.

    Returns:
        List: The project list converted into dictionaries with the relevant LDAP attributes, including nested tenants.
    """
    return map(lambda p: {'o': str(p['PROJECT_ID']),
                          'description': project_type,
                          'cn': sanitize(p['PROJECT_NAME']),
                          'owner': mapContactsToLDAP(filter(lambda owner: owner['CONTACT_ID'] in
                                                            map(lambda c: c['CONTACT_ID'],
                                                                filter(lambda o:
                                                                       o['CONTACT_ID'] is not None and
                                                                       extractOne(str(o['ROLE']),
                                                                                  TECH_ROLE,
                                                                                  score_cutoff=80),
                                                                       p['LINKS'])), USERS)
                                                     )[:1],
                          'seeAlso': mapContactsToLDAP(filter(lambda admin: admin['CONTACT_ID'] in
                                                              map(lambda c: c['CONTACT_ID'],
                                                                  filter(lambda a:
                                                                         a['CONTACT_ID'] is not None and
                                                                         extractOne(str(a['ROLE']),
                                                                                    ADMIN_ROLE,
                                                                                    score_cutoff=80),
                                                                         p['LINKS'])), USERS)
                                                       ),
                          'member': mapContactsToLDAP(filter(lambda member: member['CONTACT_ID'] in
                                                             map(lambda c: c['CONTACT_ID'],
                                                                 filter(lambda m:
                                                                        m[
                                                                            'CONTACT_ID'] is not None,
                                                                        p['LINKS'])), USERS)
                                                      ),
                          'tenants': mapProjectsToLDAP(filter(lambda t:
                                                              t['PROJECT_ID'] in
                                                              map(lambda sp:
                                                                  sp['SECOND_PROJECT_ID'],
                                                                  filter(lambda l:
                                                                         l[
                                                                             'SECOND_PROJECT_ID'] is not None,
                                                                         p['LINKS'])),
                                                              tenant_list),
                                                       project_type + [LU.OS_TENANT]) if tenant_list else [],
                          }, project_list) if project_list else []
Example #17
def get_fuzzy_player(player_name, wiki, real=False):
    """Get player id using fuzzy string match.
    Not available in UI mode."""
    if real:
        fishname, ratio = process.extractOne(player_name, wiki.keys())
        real_id = wiki.get(fishname)
        print u"It's {0}% {1}{2}{3}!".format(ratio, style.BOLD, real_id, style.END)
    else:
        fish_id, prob = process.extractOne(player_name, wiki.values())
        print 'It is {0}% {1}!'.format(prob, fish_id)
        print ''
        for k,v in wiki.items():
            if fuzz.ratio(v, fish_id) >= 85:
                print u'{0} --> {1}'.format(k, v)
Example #18
    def _getLDAPCompatibleAccount(self, account):
        account = account.copy()
        account['objectClass'] = 'inetOrgPerson'
        if extractOne('True', account.pop('isHidden'), score_cutoff=75):
            account['employeeType'] = 'hidden'

        return account
Example #19
	def main_parser(self,record,REFNO,TYPE,people,doctype):
		# if "Unidentified" not in people:
			# people.append("Unidentified")
		record = record.replace('=','')
		data = {'name':[],'role':[],'locations':[],'tracks':[],'TYPE':TYPE,'REFNO':REFNO}	
		if record in self.cache.keys():
			# print 'Found cached record: %s' % record
			data['name'] = self.cache[record]['name']
			data['role'] = self.cache[record]['role']
			data['locations'] = self.cache[record]['locations']
			data['tracks'] = self.cache[record]['tracks']
		elif len(people):
			data['name'] = fuzzyproc.extractOne(record,people)[0]
			tracks = re.findall('((?:A|B)\d+(\,*\s*(?:A|B)*\d+)*)',record)
			if len(tracks):
				for y in tracks[0][0].split(','):
					data['tracks'].append(y.strip())
				record = record.replace(tracks[0][0],'\n').strip()
			tokens = filter(lambda x: x not in data['name'].split(),[y.strip() for y in record.split(',')])
			for item in tokens:
				if item in self.roles:
					data['role'].append(item)
				elif item in self.locations:
					data['locations'].append(item)
			self.cache[record] = data
			# print 'Caching record: %s' % record
		else:
			# print 'Caching record: %s' % record
			self.cache[record] = data		
		data['role'] = list(set(data['role']))
		if len(data['role']) == 0 and TYPE == 'CREATOR' and doctype == 'Sound Recording':
			data['role'].append('performer')
		return data
Example #20
    def testNullStrings(self):
        choices = [None, "new york mets vs chicago cubs", "new york yankees vs boston red sox", None, None]

        query = "new york mets at chicago cubs"

        best = process.extractOne(query, choices)
        self.assertEqual(best[0], choices[1])
Example #21
    def process(service):
        """Parse text into commands."""
        text = service.data[ATTR_TEXT]
        match = REGEX_TURN_COMMAND.match(text)

        if not match:
            logger.error("Unable to process: %s", text)
            return

        name, command = match.groups()
        entities = {state.entity_id: state.name for state in hass.states.all()}
        entity_ids = fuzzyExtract.extractOne(name, entities,
                                             score_cutoff=65)[2]

        if not entity_ids:
            logger.error(
                "Could not find entity id %s from text %s", name, text)
            return

        if command == 'on':
            hass.services.call(core.DOMAIN, SERVICE_TURN_ON, {
                ATTR_ENTITY_ID: entity_ids,
            }, blocking=True)

        elif command == 'off':
            hass.services.call(core.DOMAIN, SERVICE_TURN_OFF, {
                ATTR_ENTITY_ID: entity_ids,
            }, blocking=True)

        else:
            logger.error('Got unsupported command %s from text %s',
                         command, text)
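One detail worth calling out in this example: entities is a dict of {entity_id: friendly_name}, and when extractOne is handed a dict-like choices object it scores the values but yields (value, score, key), which is why the code indexes [2] to recover the entity_id. Roughly (ids and names invented, and guarding against a below-cutoff None result):

from fuzzywuzzy import process

entities = {
    "light.kitchen_ceiling": "Kitchen Ceiling Light",
    "switch.coffee_maker": "Coffee Maker",
}

result = process.extractOne("kitchen light", entities, score_cutoff=65)
if result:
    matched_name, score, entity_id = result
    print(entity_id)  # -> light.kitchen_ceiling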
Example #22
 def _disableAndNotify(self, dn, ldap_conn):
     account = ldap_conn.ldap_search(dn, _ldap.SCOPE_BASE, attrlist=['employeeType', 'cn', 'mail'])[0][1]
     if account and ('employeeType' not in account or not extractOne(account['employeeType'][0],
                                                                     ['disabled'], score_cutoff=80)):
         ldap_conn.ldap_update(dn, [(_ldap.MOD_REPLACE, 'employeeType', 'disabled')])
         map(lambda e: self.mailer.sendCannedMail(e, self.mailer.CANNED_MESSAGES['disabled_account'],
                                                  account['cn'][0]), account['mail'])
Example #23
def count_in_category(x='call_type', filter_dict=None, model=DEFAULT_MODEL, app=DEFAULT_APP, sort=True, limit=1000):
    """
    Count the number of records for each discrete (categorical) value of a field and return a dict of two lists, the field values and the counts.

    >>> x, y = count_in_category(x='call_type', filter_dict={'model__startswith': 'LC60'}, limit=5, sort=1)
    >>> len(x) == len(y) == 5
    True
    >>> y[1] >= y[0]
    True
    """
    sort = sort_prefix(sort)
    model = get_model(model, app)
    filter_dict = filter_dict or {}

    x = fuzzy.extractOne(str(x), model._meta.get_all_field_names())[0]    

    objects = model.objects.filter(**filter_dict)
    objects = objects.values(x)
    objects = objects.annotate(y=models.Count(x))
    if sort is not None:
        objects = objects.order_by(sort + 'y')
    objects = objects.all()
    if limit:
        objects = objects[:int(limit)]
    objects = normalize_choices(util.sod_transposed(objects), field_name=x, app=app, human_readable=True)
    if not objects:
        return None
    objects = consolidated_counts(objects, field_name=x, count_name='y')
    if sort is not None:
        objects = sorted_dict_of_lists(objects, field_names=['y', x], reverse=bool(sort))
    return objects[x], objects['y']
Example #24
def getPersonsRef(person):

	match = process.extractOne(person, choices)
	if match[1] > 90:
		return match[0], personsDict[match[0]]['Person ID'], match[1]
	else:
		return match[0], personsDict[match[0]]['Person ID'], match[1], '-----------------!---MISSING---!!'
Example #25
def count_in_date(x='date_time', filter_dict=None, model=DEFAULT_MODEL, app=DEFAULT_APP, sort=True, limit=100000):
    """
    Count the number of records for each discrete (categorical) value of a field and return a dict of two lists, the field values and the counts.

    >>> from django.db import connection
    >>> connection.close()
    >>> x, y = count_in_date(x='date', filter_dict={'model__icontains': 'LC5'}, limit=5, sort=1)
    >>> len(x) == len(y) == 5
    True
    >>> y[1] >= y[0]
    True
    """
    sort = sort_prefix(sort)
    model = get_model(model, app)
    filter_dict = filter_dict or {}

    x = fuzzy.extractOne(str(x), model._meta.get_all_field_names())[0]    

    objects = model.objects.filter(**filter_dict)
    objects = objects.extra({'date_bin_for_counting': 'date(%s)' % x})
    objects = objects.values('date_bin_for_counting')
    objects = objects.annotate(count_of_records_per_date_bin=models.Count('pk'))
    
    # FIXME: this duplicates the dict of lists sort below
    if sort is not None:
        objects = objects.order_by(sort + 'date_bin_for_counting')
    objects = objects.all()
    if limit:
        objects = objects[:int(limit)]
    objects = util.sod_transposed(objects)
    if sort is not None:
        objects = sorted_dict_of_lists(objects, field_names=['count_of_records_per_date_bin', 'date_bin_for_counting'], reverse=bool(sort))
    #logger.info(x)
    return objects['date_bin_for_counting'], objects['count_of_records_per_date_bin']
Example #26
 def location_guesses(self):
     db_locations = dict([(l.name, l) for l in models.Location.objects.all()])
     existing = self.progress["locations"]
     guesses = self.make_model_guesses(names=self.locations(), existing=existing, name_model=db_locations)
     db_methods = dict([(pm.planting_methods, pm) for pm in models.PlantingMethod.objects.all()])
     for guess in guesses:
         name = guess.name
         planting_methods = -1
         try:
             existing_name = existing[name]
         except (KeyError, TypeError):
             existing_name = {}
         try:
             planting_methods = existing_name["planting_methods"]
         except (KeyError, TypeError):
             planting_methods = -1
         if guess.ratio and guess.ratio > 98:
             try:
                 planting_methods = models.PlantingMethod.objects.get(planting_methods="").pk
             except:
                 planting_methods = -1
         elif db_methods:
             db_methodname, ratio = fuzz_process.extractOne(name, db_methods.keys())
             if ratio > 85:
                 planting_methods = db_methods[db_methodname].pk
         guess.set_extra({"planting_methods": planting_methods})
     return guesses
Example #27
def get_model(model=DEFAULT_MODEL, app=DEFAULT_APP):
    """
    >>> from django.db import connection
    >>> connection.close() 
    >>> get_model('WikiI').__name__.startswith('WikiItem')
    True
    >>> connection.close() 
    >>> isinstance(get_model('master'), models.base.ModelBase)
    True
    >>> connection.close() 
    >>> get_model(get_model('CaseMaster', DEFAULT_APP)).objects.count() >= 0
    True
    """
    # print 'get_model' + repr(model) + ' app ' + repr(app)
    if isinstance(model, models.base.ModelBase):
        return model
    app = get_app(app)
    try:
        model_object = models.get_model(app, model)
        if model_object:
            return model_object
    except:
        pass
    app = get_app(app)
    if not app:
        return None
    model_names = [mc.__name__ for mc in models.get_models(app)]
    if app and model and model_names:
        return models.get_model(app.__package__.split('.')[-1], fuzzy.extractOne(str(model), model_names)[0])
Example #28
    def nc_att_get(self, attribute, variable=None):
        """
        Get an attribute from the NetCDF file. By default it looks into the global attributes.
        If attribute key is not found, get the closest key name instead.


        :param str attribute: The attribute key to get
        :param str variable: The variable from which to find the attribute. Global is None.
        :return: The attribute value
        :rtype: *str*

        """
        with ncopen(self.ffp) as nc:
            if variable:
                attrs = nc.variables[variable].__dict__
            else:
                attrs = nc.__dict__
            if attribute in attrs.keys():
                return attrs[attribute]
            else:
                try:
                    # match against the attribute names; passing the dict itself would
                    # score the values and return a 3-tuple, breaking the unpacking below
                    key, score = process.extractOne(attribute, list(attrs.keys()), scorer=fuzz.partial_ratio)
                    if score >= 80:
                        Print.warning('Consider "{}" attribute instead of "frequency"'.format(key))
                        return attrs[key]
                    else:
                        raise NoNetCDFAttribute(attribute, self.ffp)
                except:
                    raise NoNetCDFAttribute(attribute, self.ffp)
Example #29
def match_players_official(dfm, daily_projections, official_ids=None):
    if not official_ids:
        official_ids = {}
    for player in daily_projections[yesterday_string]['Player'].unique():
        candidates = dfm['PLAYER_NAME'].tolist()
        official_ids[player] = process.extractOne(player, candidates)[0]
    return official_ids
Example #30
    def make_model_guesses(self, names=None, existing=None, name_model=None):
        """
		names: list of names the user input
		existing: dictionary of {name: {pk: int}} that we've previously stored
		name_model: dictionary of {model.name: model} where model has member pk ('model.pk')
		returns a list of ModelGuess
		"""
        guesses = []
        for name in names:
            pk = -1
            try:
                existing_name = existing[name]
            except (KeyError, TypeError):
                existing_name = None
            if existing_name:
                try:
                    pk = existing_name["pk"]
                except:
                    pk = -1
            ratio = None
            if pk == -1 and name_model:
                db_name, ratio = fuzz_process.extractOne(name, name_model.keys())
                if ratio > 85:
                    pk = name_model[db_name].pk
            guesses.append(ModelGuess(name, pk, ratio))
        return guesses
Example #31
def get_citation_context(cits, sects, title2acl_ids, year2titles,
                         author_last2titles):
    cits_with_context = []  # (bib_idx, sect_context)

    for cit in cits:
        if cit['title'] is None or cit['book_title'] is None or cit[
                'date'] is None:
            continue

        # Find section context
        sect_contexts = []
        for context in cit['contexts']:
            for i, sect in enumerate(
                    sects):  # Try to find citation string in all sections
                if context.get('citStr') in sect['text']:
                    # found!
                    # print(sect['title'])
                    # print(sect['generic'])
                    sect_contexts.append((sect['generic'], sect['title'],
                                          context.get('citStr')))

            # print(context.get('citStr'))
            # print(context.get('position'))
            # print(context.get('startWordPos'))

        if len(sect_contexts) == 0:
            continue

        # Filter for ACL proceedings
        # TODO could be improved
        if 'ACL' in cit['book_title'] or 'Linguistics' in cit['book_title']:
            year_candidates = set(
                year2titles[cit['date']])  # papers from the same year

            if len(year_candidates) > 0:
                # papers from authors with same name
                # note: all name parts are used, bc we do not know what the first or last name is.
                author_names = [
                    name for author in cit['authors']
                    for name in author.split()
                ]
                author_candidates = []
                for name in author_names:
                    if name in author_last2titles:
                        author_candidates += author_last2titles[name]
                author_candidates = set(author_candidates)

                if len(author_candidates) > 0:
                    # candidate must be in both sets
                    candidates = year_candidates & author_candidates

                    if len(candidates) > 0:
                        match_title, score = process.extractOne(
                            cit['title'], candidates)

                        # Candidate must be above threshold
                        if score > 95 and match_title in title2acl_ids:  # extractOne scores run 0-100
                            for acl_id in title2acl_ids[match_title]:
                                # Citation found in bib
                                for sc in sect_contexts:
                                    cits_with_context.append((acl_id, sc))

                # bib_candidates = process.extract(cit['title'], candidate_titles, limit=1)
                # for c_title, score in bib_candidates:
                #    for acl_id in title2acl_ids[c_title]:
                #        # Citation found in bib
                #        for sc in sect_contexts:
                #            cits_with_context.append((acl_id, sc))

                # TODO multi title matches? -> check for year

                # print(c_idx)
                # print(bib_database.entries[c_idx]['title'])
                # print(marker)
                #    break
    return cits_with_context
Example #32
def company_list(request):
    session = requests.Session()
    ## initializing the UserAgent object
    session.headers = {
        "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }

    # creating our own def to parse urls
    def make_soup(url):
        ## getting the response from the page using the get method of the requests module
        page = session.get(url, verify=False, headers=session.headers)

        ## storing the content of the page in a variable
        html = page.content

        ## creating BeautifulSoup object
        soup = BeautifulSoup(html, "html.parser")

        return soup

    data_list = []
    url_collection = []
    html_table = []
    excel_link = []
    indexlink = []
    company_inf = []
    cik = []
    primarysymbol = []
    companyname = []
    markettier = []
    sicdescription = []

    if request.method == 'POST':

        try:
            ticker = request.POST['test']
            print(request.POST['test'])
        except:

            csv_file1 = request.FILES["csv_file1"]

            c = csv_file1.read().decode("utf-8")

            ticker = c.replace('\r', ",", (c.count('\r') - 1)).replace(
                '\n', "").replace("\r", "")

        #the api link provides meta data about companies like cik, ticker symbol, entity id or market tier.
        url = urllib.request.urlopen(
            'https://datafied.api.edgar-online.com/v2/companies?primarysymbols='
            + ticker + '&appkey=a76c61e85f9225192ce5cbbd0b22fb52').read()
        print(url)

        # converting JSON data to a dictionary
        list_of_data = json.loads(url)
        print(list_of_data)
        y = int(
            list_of_data['result']['totalrows']
        )  # find the total number of rows. if it's 0, the ticker symbol doesn't match the SEC EDGAR db

        if y == 0:
            messages.success(
                request,
                "Unmatched Ticker Symbol or No Available Financial Data."
            )  #if it's 0, give error
            return redirect(
                'EaganJones:company_list')  #show error on search page
        # data for variable list_of_data
        for i in range(0, y):
            data = {
                "cik":
                str(list_of_data['result']['rows'][i]['values'][0]['value']),
                "companyname":
                str(list_of_data['result']['rows'][i]['values'][1]['value']),
                "entityid":
                str(list_of_data['result']['rows'][i]['values'][2]['value']),
                "primaryexchange":
                str(list_of_data['result']['rows'][i]['values'][3]['value']),
                "marketoperator":
                str(list_of_data['result']['rows'][i]['values'][4]['value']),
                "markettier":
                str(list_of_data['result']['rows'][i]['values'][5]['value']),
                "primarysymbol":
                str(list_of_data['result']['rows'][i]['values'][6]['value']),
                "siccode":
                str(list_of_data['result']['rows'][i]['values'][7]['value']),
                "sicdescription":
                str(list_of_data['result']['rows'][i]['values'][8]['value']),
            }

            companyname = data.get("companyname", "")
            cik = data.get("cik", "")
            primarysymbol = data.get("primarysymbol", "")
            markettier = data.get("markettier", "")
            sicdescription = data.get("sicdescription", "")

            data_list.append(
                data
            )  # all the data which came from edgar's api is in this list. We'll show those in the detail page.

        ticker_list = ticker.split(
            ","
        )  # split the tickers that the user entered when searching for data.
        print("ticker list: " + str(ticker.split(",")))
        for i in ticker_list:  # iterate over ticker symbols to get each company's profile on SEC EDGAR
            url2 = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + i + '&type=10-k&dateb=&owner=exclude&count=40'
            url_collection.append(
                url2
            )  # company profile links stored in a list. Our journey starts from this point.
            # We'll go through to the requested cash flows table step by step
            print("these are the URLs: " + str(url2))
            print(url_collection)  # scraping starts from this url collection

        b = []
        for z in url_collection:
            souped_link = make_soup(z)  # parse the link
            b.append(souped_link)
            # search the parsed page for the filings table (b is a list, so don't call find on it)
            table = souped_link.find("table", {"class": "tableFile2"})

            indexlink_list = [
            ]  # the links which contain 10-k filings will be in this link

            for row in table.find_all("tr"):
                cells = row.findAll("td")
                if len(
                        cells
                ) == 5:  # if len(cells) is not 5, the company is registered on EDGAR but there is no data and the table is empty.
                    # an error message should be written here (TODO)

                    if cells[0].text.strip(
                    ) == '10-K':  # make sure we are at the right row. when we search for 10-K, it pulls 10-KA too.

                        link = cells[1].find(
                            "a",
                            {"id": "documentsbutton"
                             })['href']  # get the link from documents button.
                        url = "https://www.sec.gov" + link
                        indexlink_list.append(url)
                        indexlink = indexlink_list[
                            0]  # get the latest 10-K filing link
                        print(indexlink_list)

            souped_button = make_soup(
                indexlink)  # parse the link. we're so close to 10-k filing
            table2 = souped_button.find("div", {"id": "seriesDiv"})
            tables_page = "https://www.sec.gov" + table2.find("a")[
                "href"]  # get link from "interactive" button.

            souped_excel_button = make_soup(tables_page)
            excel_button = souped_excel_button.find("td").find_all(
                "a")[1]['href']
            excel_link = "https://www.sec.gov" + excel_button  #get excel link from "view excel document" button. this excel file includes all data from latest 10-k filing.
            print(excel_link)

            excel_sheet_name = pd.ExcelFile(
                excel_link
            ).sheet_names  # the problem with this excel file is that there are too many sheets.
            # the sheet we're looking for is sometimes named "cash flows statements" and sometimes "consolidated statements of cash"

            print(excel_sheet_name)

            choice_one = process.extractOne(
                "CASH FLOWS STATEMENTS", excel_sheet_name
            )  # use the fuzzywuzzy library and choose the highest score as the sheet name
            choice_two = process.extractOne("CONSOLIDATED STATEMENTS OF CASH",
                                            excel_sheet_name)
            if choice_two[1] > choice_one[1]:
                cash_flows_sheet = choice_two[0]
            else:
                cash_flows_sheet = choice_one[
                    0]  # the table cash_flows_sheet is the sheet we're looking for.
                print(choice_one)
                print(choice_two)

            df = pd.read_excel(excel_link,
                               sheet_name=cash_flows_sheet,
                               na_filter=False)  #read the excel file
            print(df)

            html_table = df.to_html(index=False)  #store table as html

            json_table = df.to_json()  #store table as json.
            print(json_table)
            # it's time to store and display the data on our website
            rf = Companies.objects.get_or_create(cik=cik,
                                                 primarysymbol=primarysymbol,
                                                 companyname=companyname,
                                                 jsonnn=json_table,
                                                 table=html_table,
                                                 markettier=markettier,
                                                 sicdescription=sicdescription)
            #m2 = Companies(table=html_table, jsonnn=json_table, **data)

            print(rf)

            #for y in ticker_list:
            company_inf = Companies.objects.filter(
                primarysymbol__iexact=data.get("primarysymbol", ""))
            print(company_inf)

        context = {
            'data_list': data_list,
            'excel_link': excel_link,
            'html_table': html_table,
            'company_inf': company_inf
        }

        messages.success(request, "Data Parsed")
        return render(request, "company_list.html", context)

    else:
        Companies()
    return render(request, "company_list.html", {})
Example #33
def merge_nursinghome_data(nyt, hifld, manual_merge_tab):
    ''' merge nursing home data
    
    Parameters
    ----------
    nyt : cleaned nyt nursing homes data
    
    hifld : cleaned hifld nursing homes data
    
    manual_merge_tab : merge table with merges to manually correct
        
    Returns
    -------
    data frame with merged nursing homes data
    '''  
    
    try:
        from fuzzywuzzy import process, fuzz
    except ImportError:
        sys.exit("""You need fuzzywuzzy.
                    Install it from https://pypi.org/project/fuzzywuzzy/
                    or run pip install fuzzywuzzy.""")
    
    # clean names and cities for better merge
    nyt = clean_nh_cities(nyt)
    nyt = clean_nh_names(nyt, level = 1)
    nyt2 = clean_nh_names(copy.deepcopy(nyt), level = 2)  # more ambitious cleaning
    hifld = clean_nh_cities(hifld)
    hifld = clean_nh_names(hifld, level = 1)
    hifld2 = clean_nh_names(copy.deepcopy(hifld), level = 2)  # more ambitious cleaning
    
    # fuzzy merging
    matched_fid = []
    for i in range(nyt.shape[0]):
        name = nyt.loc[i, "Name"]
        name2 = nyt2.loc[i, "Name"]
        city = nyt.loc[i, "City"]
        state = nyt.loc[i, "State"]

        # get exact matches
        matched_all = hifld.loc[(hifld["Name"] == name) &\
                                (hifld["City"] == city) &\
                                (hifld["State"] == state)]

        if matched_all.shape[0] == 1:  # one exact match
            fid = matched_all.iloc[0]["Fid"]
        elif matched_all.shape[0] > 1:  # more than one exact match
            if matched_all.Name.iloc[0] == "CHRISTIAN HEALTH CARE CENTER":
                fid = 6658  # manual merge
            else:
                print("Multiple exact matches for: " + name)
        else:  # if no exact match, do fuzzy matching
            # first try exact matching on city and state
            hifld_matched = hifld.loc[(hifld["City"] == city) & (hifld["State"] == state)]
            if hifld_matched.shape[0] > 0:
                matched = process.extractOne(name, hifld_matched["Name"], scorer=fuzz.WRatio)
                if matched[1] >= 87:  # if the threshold is met, we found a match
                    matched_fids = hifld_matched.loc[hifld_matched["Name"] == matched[0]]
                else:  # try using names that are even more abbreviated/cleaned
                    hifld2_matched = hifld2.loc[(hifld2["City"] == city) & (hifld2["State"] == state)] 
                    matched = process.extractOne(name2, hifld2_matched["Name"], scorer=fuzz.WRatio)
                    if matched[1] >= 87:  # if the threshold is met, we found a match
                        matched_fids = hifld2_matched.loc[hifld2_matched["Name"] == matched[0]]
                    else:  # finally try using different distance metric
                        matched = process.extractOne(name2, hifld2_matched["Name"], scorer=fuzz.ratio)
                        matched_fids = hifld2_matched.loc[hifld2_matched["Name"] == matched[0]]

                # get (a single) matched FID
                if matched_fids.shape[0] == 1:
                    fid = matched_fids["Fid"].iloc[0]
                else:  # if multiple matched FIDs
                    if not matched_fids["Population"].isna().all() == 0:  # not all nans in pop field
                        # choose one with large population
                        fid = matched_fids.loc[matched_fids["Population"] ==\
                                               np.nanmax(matched_fids["Population"])]["Fid"].iloc[0]
                    else:  # all nans in population field
                        fid = matched_fids["Fid"].iloc[0]  # take first one
            else:  # do manual merge later
                fid = np.NaN
        matched_fid.append(fid)
    nyt["Matched FID"] = matched_fid
            
    # fix some nursing home matched FIDs manually
    for i in range(manual_merge_tab.shape[0]):
        name = manual_merge_tab.Name.iloc[i]
        fid = manual_merge_tab.FID.iloc[i]
        city = manual_merge_tab.City.iloc[i]
        state = manual_merge_tab.State.iloc[i]
        idx = (nyt.Name == name) & (nyt.City == city) & (nyt.State == state)
        nyt["Matched FID"].loc[idx] = fid
    
    # take entry with max #cases to deal with duplicates in nyt
    nyt_duplicated_ls = []
    for fid in nyt["Matched FID"].loc[nyt["Matched FID"].duplicated()].unique():
        if fid == -999:
            continue
        nyt_duplicated = nyt.loc[nyt["Matched FID"] == fid]
        nyt_duplicated = nyt_duplicated.loc[nyt_duplicated["Cases_2020-05-11"] ==\
                                            np.max(nyt_duplicated["Cases_2020-05-11"])]
        nyt_duplicated_ls.append(nyt_duplicated)
        nyt = nyt.loc[nyt["Matched FID"] != fid]
    nyt_duplicated = pd.concat(nyt_duplicated_ls, axis = 0, sort = False)
    nyt = pd.concat([nyt, nyt_duplicated], axis = 0, sort = False)

    #nyt.to_csv("full_merge_table.csv", index=False)
    
    # merge with hifld
    nyt["Matched FID"] = nyt["Matched FID"].astype(int)
    nyt = nyt.rename(columns = {"Name": "NYT Name", "City": "NYT City", "State": "NYT State"})
    nh = pd.merge(hifld, nyt, left_on = "Fid", right_on = "Matched FID", how = "right")
    nh = nh.replace(-999, np.NaN)

    return nh
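The matching cascade above (exact match, then fuzz.WRatio on lightly cleaned names, then WRatio on aggressively cleaned names, then plain fuzz.ratio as a last resort) is a common way to let record linkage degrade gracefully. Stripped of the pandas plumbing, the core fallback looks roughly like this (the 87 threshold mirrors the function above; the helper and sample names are invented):

from fuzzywuzzy import fuzz, process

THRESHOLD = 87

def best_facility_match(name, candidates):
    # prefer the weighted scorer, which handles partial and reordered tokens
    match = process.extractOne(name, candidates, scorer=fuzz.WRatio)
    if match and match[1] >= THRESHOLD:
        return match
    # otherwise fall back to a plain edit-distance ratio and accept whatever it returns
    return process.extractOne(name, candidates, scorer=fuzz.ratio)

print(best_facility_match("ST MARYS NURSING CTR",
                          ["SAINT MARY'S NURSING CENTER", "OAKWOOD MANOR"]))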
Example #34
        "Armed Forces Europe": "AE",
        "PENNSYLVANIA": "PA",
        "OKLAHOMA": "OK",
        "KENTUCKY": "KY",
        "RHODE ISLAND": "RI",
        "DISTRICT OF COLUMBIA": "DC",
        "ARKANSAS": "AR",
        "MISSOURI": "MO",
        "TEXAS": "TX",
        "MAINE": "ME"
    }
    states = list(state_to_code.keys())
    print(fuzz.ratio('Python Package', 'PythonPackage'))
    print(process.extract('Mississippi', states))
    print(process.extract('Mississipi', states, limit=1))
    print(process.extractOne('Mississipi', states))
    data.apply(find_state_code, axis=1)

    print('Before Correct State:\n', data['state'])
    data['state'] = data.apply(correct_state, axis=1)
    print('After Correct State:\n', data['state'])
    data.insert(5, 'State Code', np.nan)
    data['State Code'] = data.apply(fill_state_code, axis=1)
    print(data)

    # group by
    print('==============group by================')
    print(data.groupby('State Code'))
    print('All Columns:\n')
    print(data.groupby('State Code').sum())
    print('Short Columns:\n')
Example #35
    def compare_and_find_best_match(self, player_name, team_player_tag_objs,
                                    category, player_position):

        last_name = self._remove_start_end_commas(player_name.strip())

        jw_dist_dict = dict()
        for obj_player_name in team_player_tag_objs.keys():
            if len(obj_player_name.strip()) == 0:
                continue

            jw = distance.get_jaro_distance(last_name,
                                            obj_player_name,
                                            winkler=False,
                                            scaling=0.1)
            jw_dist_dict[obj_player_name] = jw

        sorted_jw_dist_dict = sorted(jw_dist_dict.items(),
                                     key=operator.itemgetter(1),
                                     reverse=True)
        if len(sorted_jw_dist_dict) > 0 and float(
                sorted_jw_dist_dict[0][1]) == 1.0:
            return sorted_jw_dist_dict[0][0]

        top_5_players = sorted_jw_dist_dict[:5]
        jw_list_above_75 = list()
        for top_5_player in top_5_players:
            if float(top_5_player[1]
                     ) > JW_NINTY_SCORE and self._compare_first_chars(
                         last_name, top_5_player[0]):
                return top_5_player[0]
            if float(top_5_player[1]) >= JW_ACCEPT_SCORE and self._compare_first_chars(last_name, top_5_player[0]) and \
                    self._compare_player_context(category, team_player_tag_objs.get(top_5_player[0]), player_position):
                jw_list_above_75.append(top_5_player)

        if len(jw_list_above_75) == 1:
            return jw_list_above_75[0][0]

        matches_best = process.extractBests(last_name,
                                            team_player_tag_objs.keys(),
                                            limit=5)
        if len(matches_best) > 0 and int(
                matches_best[0][1]) == FULL_100_PERCENT_MATCH:
            return matches_best[0][0]

        mb_list_above_90 = list()
        for match_best in matches_best:
            if match_best[1] >= NINTY_PERCENT_MATCH and self._compare_first_chars(last_name, match_best[0]) and \
                    self._compare_player_context(category, team_player_tag_objs.get(match_best[0]), player_position):
                mb_list_above_90.append(match_best)

        if len(mb_list_above_90) == 1:
            return mb_list_above_90[0][0]
        else:
            set_ratio_100_percent_list = list()
            set_ratio_90_percent_list = list()
            for mb_player in matches_best:
                set_ratio = fuzz.token_set_ratio(last_name, mb_player[0])
                if set_ratio == FULL_100_PERCENT_MATCH:
                    set_ratio_100_percent_list.append(mb_player[0])

                if set_ratio >= EIGHTY_FIVE_PERCENT_MATCH and self._compare_first_chars(
                        last_name, mb_player[0]):
                    matched_player = team_player_tag_objs.get(mb_player[0])
                    if matched_player.pos == 'NA' or player_position == 'NA' or matched_player.pos.lower() == player_position.lower() \
                        or self._is_positions_available_in_one_category(matched_player.pos, player_position):
                        set_ratio_90_percent_list.append(mb_player[0])

            if len(set_ratio_100_percent_list) == 1:
                return set_ratio_100_percent_list[0]
            elif len(set_ratio_100_percent_list) > 1:
                set_ratio_100_percent_context_list = list()
                for set_ratio_100_percent in set_ratio_100_percent_list:
                    matched_player = team_player_tag_objs.get(
                        set_ratio_100_percent)
                    if self._compare_first_chars(last_name, set_ratio_100_percent) and \
                        (matched_player.pos == 'NA' or player_position == 'NA' or matched_player.pos.lower() == player_position.lower()
                         or self._is_positions_available_in_one_category(matched_player.pos, player_position)):
                        set_ratio_100_percent_context_list.append(
                            set_ratio_100_percent)

                if len(set_ratio_100_percent_context_list) == 1:
                    return set_ratio_100_percent_context_list[0]

            if len(set_ratio_90_percent_list) > 0:
                return set_ratio_90_percent_list[0]

        double_metaphone_match = self._compare_double_metaphone(
            last_name, matches_best)
        if double_metaphone_match is not None:
            return double_metaphone_match

        match_one = process.extractOne(last_name, team_player_tag_objs.keys())
        if match_one is not None and match_one[
                1] >= NINTY_PERCENT_MATCH and self._compare_first_chars(
                    last_name, match_one[0]):
            return match_one[0]

        # To handle 'Rahming, T.J.' and 'RAHMING,TJ': the first gives 3 tokens and the second only 2,
        # so after sorting the token order gets disturbed.
        exact_match = self._compare_exact_text_match(last_name, match_one)
        if exact_match:
            return match_one[0]

        final_match = self._compare_final_match(last_name, matches_best)
        if final_match is not None:
            return final_match

        return None
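The cascade above leans on helpers such as _compare_exact_text_match to rescue pairs like 'Rahming, T.J.' / 'RAHMING,TJ' that token-based scorers under-rate. The helper itself is not shown here, so the sketch below is only an assumption about the idea: strip punctuation and whitespace before comparing, and fall back to that check when token_sort_ratio stays below 100.

import re

from fuzzywuzzy import fuzz


def _clean(s):
    # Hypothetical helper: keep only letters and digits, lowercased.
    return re.sub(r"[^a-z0-9]", "", s.lower())


def normalized_equal(a, b):
    return _clean(a) == _clean(b)


# 'Rahming, T.J.' yields three tokens and 'RAHMING,TJ' only two, so the
# token-sorted score stays below 100 even though the names are the same.
print(fuzz.token_sort_ratio("Rahming, T.J.", "RAHMING,TJ"))  # < 100
print(normalized_equal("Rahming, T.J.", "RAHMING,TJ"))       # True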
Example #36
0
def fuzzy_account(debug, dfl, dfr):
    msg_1 = 'Searching for FUZZY Matches...'
    msg_2 = '(Debug Mode Active)\n' if debug else '(Debug Mode Inactive)\n'
    print('%s\n%s\n%s' % (msg_1, '~' * len(msg_1), msg_2))
    dfl['NameAddress'] = dfl['NameStrip'] + ' ' + dfl['AddressStrip']
    dfr['NameAddress'] = dfr['NameStrip'] + ' ' + dfr['AddressStrip']
    matching_records = []

    for idx, row in dfl.iterrows():
        print(idx)

        status_i = AccountMatch('No Name', 'N/A', 0, 'No Address', 'N/A', 0, -1, 'Create New')
        rhs_i = dfr[dfr['Country'] == row.Country]

        # Search for Exact Name Match
        print('Searching For Name Matches...')
        name_matches = rhs_i[rhs_i['NameStrip'] == row.NameStrip].reset_index(drop=True)

        # Exact Name Match(es) Found
        if not name_matches.empty:
            print('...%d Exact Name Match(es) Found:\n' % len(name_matches))
            # Select Closest Address Match
            match = process.extractOne(row.AddressStrip, name_matches.AddressStrip)
            name_matches = name_matches[name_matches.AddressStrip == match[0]].reset_index(drop=True)
            status_i.update_id_action(name_matches['Id'].loc[0], 'Verify')
            status_i.toggle('Address', 'Partial Address', match[0], match[1])
            status_i.toggle('Name', 'Exact Name', name_matches['NameStrip'].loc[0], 100)

        # Exact Name Match Not Found, Search for Exact Address Match
        else:
            msg_4 = '...No Name Matches, Trying Address...'
            print('%s\n%s' % (msg_4, '-' * len(msg_4)))
            address_matches = rhs_i[rhs_i['AddressStrip'] == row.AddressStrip].reset_index(drop=True)

            # Exact Address Found
            if not address_matches.empty:
                print('...%d Exact Address Match(es) Found:\n' % len(address_matches))
                # Select Closest Name Match
                match = process.extractOne(row.NameStrip, address_matches.NameStrip)
                address_matches = address_matches[address_matches.NameStrip == match[0]].reset_index(drop=True)
                status_i.update_id_action(address_matches[address_matches.NameStrip == match[0]]['Id'].loc[0], 'Verify')
                status_i.toggle('Name', 'Partial Name', match[0], match[1])
                status_i.toggle('Address', 'Exact Address', address_matches['AddressStrip'].loc[0], 100)

            else:
                print('Neither Name nor Address Found: ')
                trigrams = [''.join(i) for i in find_ngrams(row.NameStrip, 3)]
                trigrams = [i.replace(' ', '').replace('+', '') for i in trigrams if len(i) > 2]
                print(trigrams)
                trigram_matches = rhs_i[rhs_i['NameStrip'].str.contains('|'.join(trigrams), na=False)]

                if not trigram_matches.empty:
                    print('Trigram Matches Found')
                    trigram_matches['NameAddress'] = trigram_matches['NameStrip'] + ' ' + trigram_matches[
                        'AddressStrip']
                    print(row.AddressStrip)
                    best_match = process.extractOne(row.NameAddress, trigram_matches['NameAddress'])
                    best = trigram_matches[trigram_matches['NameAddress'] == best_match[0]].reset_index(drop=True)
                    status_i.update_id_action(best['Id'].loc[0], 'Verify')

                    best_name = process.extractOne(row.NameStrip, best['NameStrip'])
                    best_address = process.extractOne(row.AddressStrip, best['AddressStrip'])

                    print(best_name)
                    print(best_address)

                    status_i.toggle('Name', 'Partial Name', best_name[0], best_name[1])
                    status_i.toggle('Address', 'Partial Address', best_address[0], best_address[1])
                else:
                    print('No Trigram Matches Found')
                    pass

        print(row.Id, '|', row.NameStrip, '|', row.AddressStrip)
        print(status_i.id_best, '|', status_i.name_best, '|', status_i.address_best)
        print(status_i.name_status, status_i.name_prob, status_i.address_status, status_i.address_prob, '\n')
        matching_records.append([row.Id, status_i.id_best
                                    , row.NameStrip, status_i.name_best, status_i.name_status, status_i.name_prob
                                    , row.AddressStrip, status_i.address_best, status_i.address_status,
                                 status_i.address_prob])

    df = pd.DataFrame(matching_records)
    df.columns = ['Id_L', 'Id_R', 'Name_L', 'Name_R', 'NameStatus', 'NameProb', 'Address_L', 'Address_R',
                  'AddressStatus', 'AddressProb']
    return df
Example #37
0
import csv

with open("classnames.tsv") as f:
    reader = csv.reader(f, delimiter="\t")
    real_names = set(row[0] for row in reader)

sources = {
    "small-07": "124M-01-07-500.txt",
    "small-08": "124M-01-08-500.txt",
    "small-09": "124M-01-09-500.txt",
    "medium-07": "355M-01-07-1000.txt",
    "medium-08": "355M-01-08-1000.txt",
    "medium-09": "355M-01-09-1000.txt"
}

fake_names = {}

for source, file in sources.items():
    with open(file) as f:
        for name in [l.strip() for l in f.readlines()]:
            match, score = process.extractOne(name,
                                              real_names,
                                              scorer=fuzz.ratio)
            if score < 80:
                fake_names[name] = source

with open("fakeclasses.tsv", "w") as f:
    writer = csv.writer(f, delimiter="\t")
    for name, source in fake_names.items():
        writer.writerow([name, source])
Example #38
0
def recommender_final(movie_name):
    movie_list = []
    cos_sim_list = []

    #     movie_index = process.extractOne(movie_name, df_movies['title'])[2]

    full_movie_name = process.extractOne(movie_name, df_movies['title'])[0]

    # If movie index in movies_with_tags - do keyword recommendation
    if (movies_with_tags['title'] == full_movie_name).any():

        movie_index = process.extractOne(movie_name,
                                         movies_with_tags['title'])[2]

        # print(True)

        #     keyword_movie_index = process.extractOne(movie_name, movies_with_tags['title'])[2]
        #     keyword_full_movie_name = process.extractOne(movie_name, movies_with_tags['title'])[0]
        #     universe_movie_index = process.extractOne(movie_name, df_movies['title'])[2]
        #     universe_full_movie_name = process.extractOne(movie_name, df_movies['title'])[0]

        # Enumerate this movie's row of the cosine similarity matrix;
        # similar_movies is a list of (index, score) pairs over the movies_with_tags table
        similar_movies = list(enumerate(cosine_sim[movie_index]))

        # Sort so the movies with the highest cosine similarity come first
        sorted_similar_movies = sorted(similar_movies,
                                       key=lambda x: x[1],
                                       reverse=True)

        # Take the first 6 entries of sorted_similar_movies
        for element in sorted_similar_movies[0:6]:
            movie = get_title_from_index(element[0])
            movie_list.append(movie)
            sim_score = element[1]
            # convert score to percentage
            sim_score = sim_score * 100
            sim_score = round(sim_score, 2)
            cos_sim_list.append(sim_score)

        # the HTML takes the first movie from the list as "movie selected"
        # So we're adding it back here at the front after operations
        # The 1st recommendation will still be the first recommendation
        #movie_list.insert(0, full_movie_name)

        #         print('keyword_movie_index =', keyword_movie_index, '\n'
        #         'keyword_full_movie_name =', keyword_full_movie_name, '\n'
        #         'universe_movie_index =', universe_movie_index, '\n'
        #         'universe_full_movie_name =', universe_full_movie_name, '\n')

        results = [movie_list, cos_sim_list]
        return results

    # IF user searched movie NOT in our keyword table of movies
    # Then do a recommendation based on user ratings which is less accurate,
    # but has a larger universe of movies to get recommendations for
    else:

        movie_index = process.extractOne(movie_name, df_movies['title'])[2]

        distances, indices = model_knn.kneighbors(
            mat_movies_users[movie_index])

        # print('distances, indices = ', distances, indices)

        # print(indices[0][1])

        movie_rec_list = []
        distance_list = []

        for i in indices[0]:
            movie = (df_movies['title'][i])
            movie_rec_list.append(movie)

        # We only want the 5 scores returned
        for j in distances[0]:
            j = j * 100
            score = round(100 - j, 2)

            distance_list.append(score)

        results = [movie_rec_list, distance_list]
        return results
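A note on the [2] indexing used above: fuzzywuzzy's process helpers treat any choices object that exposes .items() — a dict, or a pandas Series like df_movies['title'] — as a mapping and return (match, score, key) instead of (match, score). The third element is the dict key or Series index, which is what recommender_final relies on. A minimal demonstration with a plain dict:

from fuzzywuzzy import process

titles = {10: 'Toy Story (1995)', 42: 'Jumanji (1995)'}

best = process.extractOne('jumanji', titles)
print(best)     # (matched title, score, key) -- a 3-tuple because titles is a mapping
print(best[2])  # 42, the key/index of the best match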
Example #39
0
def showmedia_site(message, app_string, site_string):
    log_message_env(message)
    if sdk.tenant_id:
        # get list of apps.
        appdef_n2id = idname.generate_appdefs_map(key_val='display_name',
                                                  value_val='id')
        app_list = appdef_n2id.keys()

        # get list of sites.
        sites_n2id = idname.generate_sites_map(key_val='name', value_val='id')
        site_list = sites_n2id.keys()

        # fuzzy match
        app_choice, app_percent = process.extractOne(app_string, app_list)
        site_choice, site_percent = process.extractOne(site_string, site_list)
        # perfect match, just get..
        if app_percent == 100 and site_percent == 100:
            message.react(GOOD_RESPONSE)
            app_id = appdef_n2id[app_choice]
            site_id = sites_n2id[site_choice]

        # good guess match..
        elif app_percent > 50 and site_percent > 50:
            message.react(GOOD_RESPONSE)
            message.reply(
                "I think you meant *{0}* at *{1}*, looking that up..".format(
                    app_choice, site_choice))
            app_id = appdef_n2id[app_choice]
            site_id = sites_n2id[site_choice]

        # if only one is good, or both are bad.
        else:
            message.react(BAD_RESPONSE)
            if app_percent <= 50:
                message.reply(
                    "I couldn't find a media application that matched what you asked for ({0}). "
                    "Try asking me about \"What apps are there?\".".format(
                        app_string))
            if site_percent <= 50:
                message.reply(
                    "I couldn't find a site that matched what you asked for ({0}). "
                    "Try asking me about \"What sites are there?\".".format(
                        site_string))
            return

        # Figure out the links/do all the work now.
        attachments = render_site_media_paths(app_id, site_id, sdk,
                                              global_id2n)

        # check if successful, add title
        if attachments[0].get('pretext') != "Sorry, couldn't query the media application info for this site at the " \
                                            "moment. Please try later.":
            message.reply("*Path status for {0} at {1}:*".format(
                app_choice, site_choice))

        # now, send it
        message.send_webapi('', json.dumps(attachments))

    else:
        message.react(BAD_RESPONSE)
        message.send(CGX_API_ERROR_MSG)
Example #40
0
    def _attach_legistar_details_to_event(
        event: Dict[str, Any],
        ignore_minutes_items: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Query for and attach the best matching legistar event information to the provided event details.

        Parameters
        ----------
        event: Dict[str, Any]
            The parsed event details from the SeattleChannel website.
        ignore_minutes_items: Optional[List[str]]
            A list of minute item names to ignore when parsing the minutes items from legistar.
            Useful for minute items that are so commonly used they lack specific value.

        Returns
        -------
        joined: Dict[str, Any]
            The base event details object combined with the found legistar data.
        """
        # Get all legistar events surrounding the provided event date
        legistar_events = legistar_event_tools.get_legistar_events_for_timespan(
            "seattle",
            event["event_datetime"],
            event["event_datetime"] + timedelta(days=1)
        )
        log.debug("Pulled legistar details for event: {}".format(event["source_uri"]))

        # Fast return for only one event returned
        if len(legistar_events) == 1:
            selected_event = legistar_events[0]
        else:
            # Reduce events to not include cancelled events
            cancelled_reduced = [e for e in legistar_events if e["EventAgendaStatusName"] != "Cancelled"]

            # Get body names
            available_bodies = set([e["EventBodyName"] for e in cancelled_reduced])

            # Check if the Seattle Channel body name (basically a "display name") is present in the list
            # If so, choose the events with that exact body name
            if event["body"] in available_bodies:
                legistar_events = [e for e in cancelled_reduced if e["EventBodyName"] == event["body"]]
            # No exact match available, find the closest body name by text diff
            else:
                # Returns the closest name and the score that made it the closest
                closest_body_name, score = process.extractOne(event["body"], available_bodies)

                # For reasons somewhat unknown to me, SeattleChannel has videos for events that don't exist in legistar
                # We can somewhat detect this by filtering out body names that are drastically different
                # In the case that the closest body name is less than a 50% match, return None to be cleaned up after
                # The body names shouldn't be _that_ different which is why we are just ignoring for now
                if score < 50:
                    return None

                # Otherwise, use the found body name
                legistar_events = [e for e in cancelled_reduced if e["EventBodyName"] == closest_body_name]

            # Run agenda matching against the events
            agenda_match_details = legistar_event_tools.get_matching_legistar_event_by_minutes_match(
                event["minutes_items"],
                legistar_events
            )

            # Add the details
            selected_event = agenda_match_details.selected_event

        # Parse details
        if ignore_minutes_items is None:
            ignore_minutes_items = []
        parsed_details = legistar_event_tools.parse_legistar_event_details(selected_event, ignore_minutes_items)

        # Format the event details
        formatted_event_details = {
            **parsed_details,
            "source_uri": event["source_uri"],
            "video_uri": event["video_uri"],
            "caption_uri": event["caption_uri"]
        }
        log.debug("Attached legistar event details for event: {}".format(formatted_event_details["source_uri"]))
        return formatted_event_details
Example #41
0
    def enqueue_audio_playlist(self, arg):
        """Add all audio tracks in a Plex playlist to the playback queue.

        :param arg: a playlist search term

        """
        logging.info("arg : %s", arg)
        print_msg("[Plex] [Playlist search in server] : '{0}'. ".format(
            self.base_url))
        try:
            count = len(self.queue)
            playlist_title = ""
            playlist = None

            try:
                playlist = self._plex.playlist(title=arg)
                if playlist:
                    playlist_title = playlist.title
                    print_wrn("[Plex] Playing '{0}'.".format(playlist_title))
                    for item in list(playlist.items()):
                        if item.TYPE == "track":
                            track = item
                            track_info = TrackInfo(track, track.artist(),
                                                   track.album())
                            self._add_to_playback_queue(track_info)
                        if count == len(self.queue):
                            print_wrn(
                                "[Plex] '{0}' No audio tracks found.".format(
                                    playlist_title))
                            raise ValueError

            except (NotFound):
                pass

            if count == len(self.queue):
                playlist_dict = dict()
                playlist_titles = list()
                playlists = self._plex.playlists()
                for pl in playlists:
                    playlist_titles.append(pl.title)
                    playlist_dict[pl.title] = pl

                if len(playlist_titles) > 1:
                    playlist_title = process.extractOne(arg,
                                                        playlist_titles)[0]
                    playlist = playlist_dict[playlist_title]
                elif len(playlist_titles) == 1:
                    playlist_title = playlist_titles[0]
                    playlist = playlist_dict[playlist_title]

                if playlist:
                    print_adv("[Plex] '{0}' not found. "
                              "Playing '{1}' instead.".format(
                                  arg, playlist_title))
                    for item in list(playlist.items()):
                        if item.TYPE == "track":
                            track = item
                            track_info = TrackInfo(track, track.artist(),
                                                   track.album())
                            self._add_to_playback_queue(track_info)
                        if count == len(self.queue):
                            print_wrn(
                                "[Plex] '{0}' No audio tracks found.".format(
                                    playlist_title))

            self._finalise_play_queue(count, arg)

        except (ValueError, NotFound):
            raise ValueError(
                str("Playlist not found or no audio tracks in playlist : %s" %
                    arg))
Example #42
0
def fuzzyexact(df_left,
               df_right,
               id_col=None,
               key=None,
               block1=None,
               block2=None,
               threshold=80):
    '''Fuzzy match function which takes df1 as input and returns fuzzy matched items from df2'''

    #create key by concatenating the key columns with whitespace removed
    df_left['key'] = ''
    df_right['key'] = ''
    for k in key:
        df_left['key'] = df_left['key'] + df_left[k].str.replace(' ', '')
        df_right['key'] = df_right['key'] + df_right[k].str.replace(' ', '')

    #run fuzzy matching
    matched = {'Match': [], 'Score': []}

    for index, row in df_left.iterrows():

        if block1 is not None and block2 is not None:
            df_right_reduced = df_right[(df_right[block1] == row[block1])
                                        & (df_right[block2] == row[block2])]
        elif block1 is not None:
            df_right_reduced = df_right[(df_right[block1] == row[block1])]
        elif block2 is not None:
            df_right_reduced = df_right[(df_right[block2] == row[block2])]
        else:
            df_right_reduced = df_right.copy()

        if len(df_right_reduced.index) > 0:
            match = process.extractOne(row['key'],
                                       df_right_reduced['key'],
                                       score_cutoff=threshold)
            if match is not None:
                matched['Match'].append(match[0])
                matched['Score'].append(match[1])
            else:
                matched['Match'].append('')
                matched['Score'].append('')
        else:
            matched['Match'].append('')
            matched['Score'].append('')

    matched = pd.DataFrame(matched)

    finl = pd.concat([df_left, matched], axis=1)

    #append ID column from df_right to allow for easy lookup
    if id_col is not None:
        ids = df_right.copy()
        ids = ids[['key', id_col]]

        finl = finl.merge(ids,
                          left_on='Match',
                          right_on='key',
                          how='left',
                          suffixes=('', '_y'))
        finl.drop(['key_y'], axis=1, inplace=True)

    return finl
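A small usage sketch of fuzzyexact with made-up DataFrames may help; every column name and value below is hypothetical, and pandas plus fuzzywuzzy are assumed to be importable as above.

import pandas as pd

left = pd.DataFrame({
    'first': ['John', 'Mary'],
    'last': ['Smyth', 'Jones'],
    'country': ['US', 'US'],
})
right = pd.DataFrame({
    'first': ['Jon', 'Mary'],
    'last': ['Smith', 'Jones'],
    'country': ['US', 'US'],
    'acct_id': ['A-1', 'A-2'],
})

# Block on country so only same-country rows are compared, and fuzzy-match
# on a concatenated first+last key with an 80% score cutoff.
result = fuzzyexact(left, right,
                    id_col='acct_id',
                    key=['first', 'last'],
                    block1='country',
                    threshold=80)
print(result[['first', 'last', 'Match', 'Score', 'acct_id']])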
Example #43
0
    def guess_id(self, name):
        """@return: id, name, score"""
        name, score = process.extractOne(name, self._all_name_list)
        return self._roster[name], name, score
Example #44
0
def find_potential_checkouts_v2(df_chkout, stmt_amt, stmt_bank, stmt_desc):
    # Definitions:
    ################
    # _ab : subset of amt & bank
    # _abn: subset of amt, bank & exact name
    chkoutid, pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None, None

    # Step 1:
    # Filter potential checkouts by proof amount & bank
    ################
    potential_chkouts_ab = df_chkout[
        (df_chkout['proof_amount'] == stmt_amt)
        & (df_chkout['[A] script_bank_cat'] == stmt_bank)]
    #
    # Step 2: Further filter potential checkouts if proof cust name is in description
    if len(potential_chkouts_ab.index) == 0:
        # Situation 1: No Amt Bank match
        chkoutid = 'Amount / Bank wrong'
        pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None
    else:
        # Situation 2: Amt & Bank match, proceed to confirm using name (proof name ~ stmt desc)
        potential_chkouts_ab[
            '[B] proof_cust_name_clean'] = potential_chkouts_ab[
                '[B] proof_cust_name_clean'].fillna('').str.lower(
                ).str.replace(
                    '\"',
                    '')  # (1) fills na with un-matchable name (2) cleans it
        potential_chkouts_abn = potential_chkouts_ab[potential_chkouts_ab[
            '[B] proof_cust_name_clean'].map(lambda x: x in stmt_desc)]
        #
        if len(potential_chkouts_abn.index) == 1:
            # Situation 2a: Single match using Amt, Bank & exact Name
            chkoutid = potential_chkouts_abn['checkoutid'].item(
            )  # use subset of amount, bank & name
            pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None
        elif len(potential_chkouts_abn.index) == 0:
            ########################
            ### WORK IN PROGRESS ###
            ########################
            # Situation 2b: (amt & bank --> some candidates, no exact match with name --> 2 options: possibility of approx match / no match at all)
            chkout_candidates = potential_chkouts_ab['checkoutid'].tolist(
            )  # check subset of amount & bank
            pmax = process.extractOne(
                stmt_desc,
                potential_chkouts_ab['[B] proof_cust_name_clean'].tolist(),
                scorer=fuzz.token_set_ratio,
                score_cutoff=50)
            if pmax is None:
                pmax_name, pmax_score = None, None
            else:
                pmax_name = pmax[0]
                pmax_score = pmax[1]
                try:
                    chkoutid = potential_chkouts_ab[
                        potential_chkouts_ab['[B] proof_cust_name_clean'] ==
                        str(pmax_name)]['checkoutid'].item()
                except ValueError:
                    chkoutid = None

            p0 = list(
                process.extractWithoutOrder(
                    stmt_desc,
                    potential_chkouts_ab['[B] proof_cust_name_clean'].tolist())
            )
            p0_names = [x[0] for x in p0]
            p0_scores = [x[1] for x in p0]
            p1 = list(
                process.extractWithoutOrder(
                    stmt_desc,
                    potential_chkouts_ab['[B] proof_cust_name_clean'].tolist(),
                    scorer=fuzz.token_sort_ratio)
            )  # Note: this is using token_sort_ratio
            p1_names = [x[0] for x in p1]
            p1_scores = [x[1] for x in p1]
            p2 = list(
                process.extractWithoutOrder(
                    stmt_desc,
                    potential_chkouts_ab['[B] proof_cust_name_clean'].tolist(),
                    scorer=fuzz.token_set_ratio)
            )  # Note: this is using token_set_ratio
            p2_names = [x[0] for x in p2]
            p2_scores = [x[1] for x in p2]
########################
### WORK IN PROGRESS ###
########################
        else:
            chkoutid = 'Many names found'
            pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None
    #
    return (chkoutid, pmax_name, pmax_score, chkout_candidates, p0_names,
            p0_scores, p1_names, p1_scores, p2_names, p2_scores)
Example #45
0
print('string1 ="'+ string1+'"')
print('string2 ="'+ string2+'"')
print('string3 ="'+ string3+'"')
print('The similarity between string1 and string2 is:', fuzz.ratio(string1, string2))
print('The similarity between string1 and string3 is:', fuzz.ratio(string1, string3))
print('The similarity between string2 and string3 is:', fuzz.ratio(string2, string3))

print('The partial similarity between string1 and string2 is:', fuzz.partial_ratio(string1, string2))
print('The partial similarity between string1 and string3 is:', fuzz.partial_ratio(string1, string3))
print('The partial similarity between string2 and string3 is:', fuzz.partial_ratio(string2, string3))

print('Example 2 from datacamp.com:  ')
Str1 = "The supreme court case of Nixon vs The United States"
Str2 = "Nixon v. United States"
Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
print('simple ratio', Ratio)
print('partial ratio', Partial_Ratio)
print('sorted token ratio', Token_Sort_Ratio)
print('set token ratio', Token_Set_Ratio)

print('Example 3 for process module:  ')
str2Match = "apple inc"
strOptions = ["Apple Inc.","apple park","apple incorporated","iphone"]
Ratios = process.extract(str2Match,strOptions)
print(Ratios)
# You can also select the string with the highest matching percentage
highest = process.extractOne(str2Match,strOptions)
print(highest)
Example #46
0
def fuzzy(media, lib, scorer=fuzz.QRatio):
    """  Use Fuzzy Wuzzy to return highest scoring item. """
    if isinstance(lib, list) and len(lib) > 0:
        return fw.extractOne(media, lib, scorer=scorer)
    else:
        return ["", 0]
Example #47
0
            attribute3 = rowlist[4]
            attribute2 = rowlist[3]

            if (attribute2 != " number"):
                #Switch with attribute 1
                #IF THERE IS AN EXACT MATCH
                for entity in entity_list:
                    if (attribute1 == entity):
                        continue
                    if (attribute3 == entity):
                        continue

                    else:

                        Ratios = process.extract(attribute1, entity_list)
                        highest = process.extractOne(attribute1, entity_list)
                        rowlist[2] = highest[0]
                        # print("MATCH", highest[0], attribute1)

                        # Ratios = process.extract(attribute2,entity_list)
                        # highest = process.extractOne(attribute2,entity_list)
                        # rowlist[3] = highest[0]
                        # print("MATCH", highest[0], attribute2)

                        Ratios = process.extract(attribute3, entity_list)
                        highest = process.extractOne(attribute3, entity_list)
                        rowlist[4] = highest[0]
                        # print("MATCH", highest[0], attribute3)

                    # i+=1
            print(rowlist)
Example #48
0
def lerArquivos(request):
    dominios = 'C:/xampp/htdocs/desafio_emails/domain_list.csv'
    emails = 'C:/xampp/htdocs/desafio_emails/email_list.csv'

    dadosDominios = []
    novoDadosDominios = []
    dadosEmails = []
    novosDadosEmails = []

    lista_dominios = pd.read_csv(dominios, names=['Domain'])
    lista_emails = pd.read_csv(emails, names=["E-mails"])
    total = lista_emails.shape[0]

    if request.POST['email'] != "":
        emailAdicionado = request.POST['email']
        lista_emails.loc[total+1] = emailAdicionado
        total = lista_emails.shape[0]
    else:
        emailAdicionado = ""

    for (i, row) in lista_dominios.itertuples():
        dadosDominios.append(row)

    for dominio in dadosDominios:
        itemDominio = dominio
        for y in ["'"]:
            item = itemDominio.replace(y, "")
            novoDadosDominios.append(item)

    for (j, linha) in lista_emails.itertuples():
        dadosEmails.append(linha)

    for emailList in dadosEmails:
        itemEmail = emailList
        for z in ["'"]:
            addEmail = itemEmail.replace(z, "")
            novosDadosEmails.append(addEmail)

    listEmailCerto = []
    listEmailErrado = []
    for i in novosDadosEmails:
        resultado = i.split("@")
        email = i
        if resultado[1] in novoDadosDominios:
            listEmailCerto.append(email)
        else:
            listEmailErrado.append(email)

    totalErrados = len(listEmailErrado)
    totalCertos = len(listEmailCerto)
    corrigidos = []
    gmail = [] #0
    hotmail = [] #1
    hotmailBr = [] #2
    hotmailMX = [] #3
    hotmailAr = [] #4
    msn = [] #5

    for i in listEmailErrado:
        resultadoErrado = i.split("@")
        z = process.extractOne(resultadoErrado[1], novoDadosDominios, scorer=fuzz.token_sort_ratio)
        resultadoErrado[1] = z[0]
        corrigidos.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        if z[0] == 'gmail.com':
            gmail.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'hotmail.com.br':
            hotmailBr.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'hotmail.com.mx':
            hotmailMX.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'hotmail.com.ar':
            hotmailAr.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        elif z[0] == 'msn.com':
            msn.append(resultadoErrado[0] + '@' + resultadoErrado[1])
        else:
            hotmail.append(resultadoErrado[0] + '@' + resultadoErrado[1])

    dictErrado = {}
    dictCerto ={}

    for emailCorrigido in corrigidos:
        login = emailCorrigido.split("@")
        tamLogin = len(login[0])
        dictErrado[login[0]] = [tamLogin, login[1]]
        login[1]=""

    dataFrameErrados = pd.DataFrame(data=dictErrado)
    datasetErrados = dataFrameErrados.T.reset_index()

    for (i, row) in datasetErrados[1].iteritems():
        if row == "gmail.com":
            datasetErrados[1][i] = 0;
        if row == "hotmail.com":
            datasetErrados[1][i] = 1;
        if row == "hotmail.com.br":
            datasetErrados[1][i] = 2;
        if row == "hotmail.com.mx":
            datasetErrados[1][i] = 3;
        if row == "hotmail.com.ar":
            datasetErrados[1][i] = 4;
        if row == "msn.com":
            datasetErrados[1][i] = 5;

    groupByErrados = datasetErrados.groupby([datasetErrados[0],datasetErrados[1]], as_index=False).size()

    print(groupByErrados)
    #exit()

    for emailCerto in listEmailCerto:
        loginCerto = emailCerto.split("@")
        tamLoginCerto = len(loginCerto[0])
        dictCerto[loginCerto[0]] = [tamLoginCerto, loginCerto[1]]
        loginCerto[1] = ""

    dataFrameCerto = pd.DataFrame(data=dictCerto)
    datasetCerto = dataFrameCerto.T.reset_index()

    for (a, rows) in datasetCerto[1].iteritems():
        if rows == "gmail.com":
            datasetCerto[1][a] = 0;
        if rows == "hotmail.com":
            datasetCerto[1][a] = 1;
        if rows == "hotmail.com.br":
            datasetCerto[1][a] = 2;
        if rows == "hotmail.com.mx":
            datasetCerto[1][a] = 3;
        if rows == "hotmail.com.ar":
            datasetCerto[1][a] = 4;
        if rows == "msn.com":
            datasetCerto[1][a] = 5;

    print(datasetCerto.groupby([datasetCerto[0], datasetCerto[1]]).size())

    domains = ['Gmail', 'Hotmail', 'HotmailBr', 'HotmailMx', 'HotmailAr', 'Msn']
    countDomains = [len(gmail),  len(hotmail), len(hotmailBr), len(hotmailMX), len(hotmailAr), len(msn)]
    descricaoX = 'Domínios'
    descricaoY = 'Quantidades de erros de escrita'

    #plotarGraficos(domains, countDomains, descricaoX, descricaoY)
    #plotarGraficos(dictCerto.values(), descricaoX, domains, 'Tamanho do login')

    countDomains = [len(gmail), len(hotmail), len(hotmailBr), len(hotmailMX), len(hotmailAr), len(msn)]

    return {'countDomains': countDomains,
                   'totalEmails': total,
                   'totalCertos':totalCertos,
                   'totalErrados': totalErrados,
                   'emailAdicionado': emailAdicionado}
Example #49
0
def sanitize_command(command):
    sanitized = process.extractOne(command, commands)
    if sanitized[1] == 0:
        return ''
    else:
        return sanitized[0]
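The zero-score guard above can also be expressed with extractOne's score_cutoff parameter, which makes the call return None outright when no choice reaches the threshold. A minimal sketch; the commands list and the 60-point cutoff are stand-ins for illustration:

from fuzzywuzzy import process

commands = ['status', 'restart', 'shutdown']  # stand-in command list


def sanitize_command_with_cutoff(command, cutoff=60):
    # extractOne returns None when nothing scores at least `cutoff`.
    result = process.extractOne(command, commands, score_cutoff=cutoff)
    return result[0] if result else ''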
Example #50
0
                    'milk', 'spaghetti', 'ramen', 'steak', 'drink', 'bread',
                    'potato', 'barbecue', 'wings', 'burrito', 'pasta', 'pizza',
                    'vegetable', 'burger', 'hot dog', 'chicken', 'fish', 'rice'
                ]
                values = [
                    40, 90, 80, 140, 40, 40, 70, 140, 115, 85, 90, 75, 50, 105,
                    115, 85, 120, 90
                ]
                items = {k: v for k, v in zip(keys, values)}

                # fuzzy-match each label annotation against the known food keys
                i = len(resp['labelAnnotations']) - 1
                bestmatch = "dish"
                while (i >= 0):
                    t = resp['labelAnnotations'][i]
                    result = process.extractOne(t['description'], keys)
                    if result[1] >= 70:
                        bestmatch = result[0]
                    i = i - 1
                # print("They are ")
                first_string = bestmatch + ";"
                # print("time is")
                if bestmatch != "dish":
                    foodtime = items[bestmatch]
                else:
                    foodtime = 80
            #save the size response to a size.json file
            for idx, resp in enumerate(sizeresponse.json()['responses']):
                # save to JSON file
                imgname = image_filenames[idx]
                jpath = join(RESULTS_DIR, basename(imgname) + 'size' + '.json')
Example #51
0
def process_urban(text,
                  streets,
                  cities,
                  threshold_city=70,
                  threshold_street=50,
                  ratio=0.85):
    text = preprocess_urban_text(text, cities)
    suspected_city = process.extractOne(text,
                                        cities,
                                        scorer=fuzz.partial_ratio,
                                        score_cutoff=threshold_city)
    if suspected_city is not None:
        suspected_city = suspected_city[0]
        streets_in_city = streets.loc[streets.city == suspected_city]
        relevant_streets_1 = streets_in_city.loc[(streets_in_city.street1 !=
                                                  'NaN')].street1
        relevant_streets_2 = streets_in_city.loc[(streets_in_city.street2 !=
                                                  'NaN')].street2
        relevant_streets = relevant_streets_1.append(
            relevant_streets_2).drop_duplicates()
        relevant_streets_scores = relevant_streets.apply(
            lambda x: streets_in_city.loc[(streets_in_city.street1 == x) | (
                streets_in_city.street2 == x)].avg_accidents.max())
        relevant_streets = pd.DataFrame({
            'street':
            relevant_streets.tolist(),
            'avg_accidents':
            relevant_streets_scores.tolist()
        })
        suspected_streets = process.extract(
            text,
            list(set(relevant_streets.street.dropna().tolist())),
            scorer=fuzz.token_set_ratio,
            limit=3)
        if len(suspected_streets) > 0:
            relevant_streets_scores = relevant_streets.loc[
                relevant_streets.street.isin([
                    suspected_street[0]
                    for suspected_street in suspected_streets
                ])].copy()
            relevant_streets_scores.avg_accidents = (
                relevant_streets_scores.avg_accidents /
                relevant_streets_scores.avg_accidents.max()).copy()
            suspected_streets = [
                (suspected_street[0], (ratio * fuzz.token_set_ratio(
                    text, suspected_city[0] + ' ' + suspected_street[0])) +
                 ((1 - ratio) * 100 * relevant_streets_scores.loc[
                     relevant_streets_scores.street ==
                     suspected_street[0]].avg_accidents.iloc[0]))
                for suspected_street in suspected_streets if
                suspected_street is not None and (ratio * fuzz.token_set_ratio(
                    text, suspected_city[0] + ' ' + suspected_street[0])) +
                ((1 - ratio) * 100 * relevant_streets_scores.loc[
                    relevant_streets_scores.street == suspected_street[0]].
                 avg_accidents.iloc[0]) > threshold_street
            ]
        if len(suspected_streets) > 0:
            suspected_street = max(suspected_streets, key=lambda x: x[1])
            suspected_street = suspected_street[0]
            if suspected_street in streets_in_city.street1.tolist():
                suspected_street = streets_in_city.loc[
                    streets_in_city.street1 == suspected_street].iloc[0]
                return UrbanAddress(city=suspected_street.yishuv_name,
                                    street=suspected_street.street1_hebrew)
            else:
                suspected_street = streets_in_city.loc[
                    streets_in_city.street2 == suspected_street].iloc[0]
                return UrbanAddress(city=suspected_street.yishuv_name,
                                    street=suspected_street.street2_hebrew)
        return UrbanAddress(city=streets.loc[
            streets.city == suspected_city].yishuv_name.iloc[0])
    return None
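The street ranking above blends text similarity with how accident-prone each candidate street is; a minimal arithmetic sketch of that weighting, with made-up strings and numbers, shows what the ratio=0.85 parameter controls:

from fuzzywuzzy import fuzz

ratio = 0.85                               # weight of the text-similarity component
text = 'crash near herzl street tel aviv'  # made-up report text
city, street = 'tel aviv', 'herzl'         # made-up candidate street in the matched city
normalized_avg_accidents = 0.6             # this street's avg_accidents / max over candidates (made up)

score = (ratio * fuzz.token_set_ratio(text, city + ' ' + street)
         + (1 - ratio) * 100 * normalized_avg_accidents)
# The candidate is kept only if score exceeds threshold_street (50 by default above).
print(score)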
Example #52
0
def fuzzy(sentence):
    str2match = sentence
    strOptions = words
    Ratios = process.extract(str2match, strOptions)
    highest = process.extractOne(str2match, strOptions)
    return highest[0]
Example #53
0
def get_stat_df(live_team, refresh):
    championship = live_team[1:live_team.find(']')]
    live_team = live_team[live_team.find(' ') + 1:].rstrip()
    championship_url = MATCHENDIRECT_URLS_DICT[championship]
    to_return1 = ([], [])
    to_return2 = ([], [])
    to_return3 = ''
    to_return4 = {}
    global x_y
    x_y = [[], []]
    text_to_add = ''
    game_score = ''
    good_game = None
    game_name = ''
    whole_df = []
    global final_time
    final_time = None
    for try_game in fetch_bet_urls(BET_URLS_DICT[championship]):
        if live_team in get_game_teams(try_game):
            good_game = try_game
            whole_df = get_odds(good_game)
            if whole_df.empty:
                whole_df.insert(loc=0,
                                column=get_game_name(good_game),
                                value='Côtes indisponibles',
                                allow_duplicates=True)
                text_to_add = " On ne trouve pas les côtes du match en question !"
                to_return1 = ([{
                    'name': col,
                    'id': col
                } for col in whole_df.columns], whole_df.to_dict('records'))
            else:
                whole_df.insert(loc=0,
                                column=get_game_name(good_game),
                                value='Côtes :',
                                allow_duplicates=True)
                to_return1 = ([{
                    'name': col,
                    'id': col
                } for col in whole_df.columns], whole_df.to_dict('records'))
                min_odd = whole_df.min(axis=1)
                print(min_odd.values)
    page = process_url(MATCHENDIRECT_URLS_DICT[championship] +
                       str(datetime.date.today().isocalendar()[0]) + '-' +
                       str(datetime.date.today().isocalendar()[1]) + '/')
    target_page = page.findAll('tr', {'class': 'sl'})
    url_list = []
    if len(target_page) > 1:
        for elem in target_page:
            url_list.append(elem.find('a', href=True)['href'])
            link = 'https://www.matchendirect.fr/' + process.extractOne(
                live_team, url_list)[0]
        try:
            whole_df2 = infos_game(link=link, to_csv=False)
        except:
            pass
        else:
            if whole_df2 is not None:
                [x_y[0], x_y[1], predicted_score,
                 final_time] = update_graph(whole_df,
                                            whole_df2,
                                            previous_x=[x_y[0]],
                                            previous_y=[x_y[1]],
                                            final_time=final_time)
                info_game = whole_df2.to_csv('file')
                to_return4 = {
                    'data': [{
                        'x': x_y[0],
                        'y': x_y[1]
                    }],
                    'layout': {
                        'title':
                        f"Notre prédiction pour {game_name} : score de {predicted_score}. <br> C'est un bon moment pour parier si {x_y[0][-1].values[0]} > 1 !"
                    }
                }
                to_return2 = ([{
                    'name': col,
                    'id': col
                } for col in whole_df2.columns], whole_df2.to_dict('records'))
                game_score = f" Le score est actuellement de {whole_df2.iloc[0]['Buts']} - {whole_df2.iloc[1]['Buts']} !\n"
    elif len(target_page) == 1:
        link = 'https://www.matchendirect.fr/' + page.find(('tr'), {
            'class': 'sl'
        }).find('a', href=True)['href']
        try:
            whole_df2 = infos_game(link=link, to_csv=False)
        except:
            pass
        else:
            if whole_df2 is not None:
                [x_y[0], x_y[1], predicted_score,
                 final_time] = update_graph(whole_df,
                                            whole_df2,
                                            previous_x=[x_y[0]],
                                            previous_y=[x_y[1]],
                                            final_time=final_time)
                info_game = whole_df2.to_csv('file')
                to_return4 = {
                    'data': [{
                        'x': x_y[0],
                        'y': x_y[1]
                    }],
                    'layout': {
                        'title':
                        f"Notre prédiction pour {game_name} : score de {predicted_score}. <br> C'est un bon moment pour parier si {x_y[0][-1].values[0]} > 1 !"
                    }
                }
                whole_df2.insert(loc=0,
                                 column=get_game_name(good_game),
                                 value=[
                                     get_game_teams(good_game)[0],
                                     get_game_teams(good_game)[1]
                                 ],
                                 allow_duplicates=True)
                to_return2 = (
                    [{
                        'name': col,
                        'id': col
                    } for col in whole_df2.columns],
                    whole_df2.to_dict('records'),
                )
                game_score = f" Le score est actuellement de {whole_df2.iloc[0]['Buts']} - {whole_df2.iloc[1]['Buts']} !\n"
    if good_game is not None:
        game_name = get_game_name(good_game)
        try:
            game_time = int(
                whole_df2.index.get_level_values("Minute").values[0][:-1])
        except:
            to_return3 = f"C'est la mi-temps du match {game_name} !\n" + game_score + " Regardez ce que recommande notre modèle..."
        else:
            to_return3 = f"C'est la {game_time}e minute du match {game_name} !\n" + game_score
            if game_time < 20:
                to_return3 += " Il est encore trop tôt pour prédire l'avenir..."
            elif game_time >= 89:
                to_return3 += " Il est trop tard pour aller parier sur Betclic !"
            else:
                to_return3 += " Regardez ce que recommande notre modèle..."
        to_return3 += text_to_add
        to_return1 = list(to_return1)
        to_return2 = list(to_return2)
    return (to_return1[0], to_return1[1], to_return2[0], to_return2[1],
            to_return3, to_return4)
Example #54
0
        for i in lyrs:
            print(i.rstrip("\n"))
            if i.startswith("Root") and len(lyrs) > 1:
                lyrs.remove(i)

            elif i.startswith("Root") or i.startswith("UnmatchedService") and len(lyrs) == 1:
                url = i.split("|||")[-1].rstrip("\n")
                resource_name = os.path.split(f)[0].lstrip(START_PATH + os.path.sep)
                r = requests.get(url+"?f=json")

                if r.status_code == 200:
                    rj = r.json()

                    if "layers" in rj:
                        layers_dict = {lyr["name"]: lyr["id"] for lyr in rj["layers"]}
                        match = process.extractOne(resource_name, layers_dict.keys())
                        print(resource_name, match)
                        go_ahead = input("Match? (Y/N/O): ")

                        if go_ahead.lower() == "y":
                            lyr_index = layers_dict[match[0]]
                            lyrs.append("MatchedService|||" + url.rstrip("/") + "/" + str(lyr_index) +"\n")
                            lyrs.remove(i)
                        elif go_ahead.lower() == "o":
                            new_url = input("Enter service url: ")
                            lyrs.append("MatchedService|||" + new_url + "\n")
                        else:
                            lyrs.append("UnmatchedService|||" + url.rstrip("/") + "\n")
                            lyrs.remove(i)

            elif i.startswith("UnmatchedService") and len(lyrs) == 2:
Example #55
0
def get_fuzzy_images(word: str):
    keywords = [k.keyword for k in db.query(Keyword).all()]
    keyword, score = processfuzz.extractOne(word, keywords)
    if score > 93:
        return keyword
    return None
Example #56
0
fwci_clean['surname'] = fwci_clean['Author'].str.split(',').str[0]
fwci_clean = fwci_clean.set_index('Author')
fwci_clean.drop(
    [col for col in fwci_clean.columns.tolist() if 'Unnamed' in str(col)],
    axis=1,
    inplace=True)
fwci_clean.dropna(axis=0, how='all', inplace=True)
fwci_clean.replace('-', np.nan, inplace=True)

retrieved_authors = fwci_clean.index.tolist()
surname_mapper = {}
for index in awardees.index.tolist():
    author = awardees.loc[index, 'name']
    author_surname = awardees.loc[index, 'surname']
    # process.extract(author, retrieved_authors)
    result, score = process.extractOne(author, retrieved_authors)
    result_surname = result.split(',')[0]
    if result_surname.lower() == author_surname.lower():
        surname_mapper[author] = result
    else:
        logger.info(f'No match found for {author}')

awardees['match_name'] = awardees['name'].map(surname_mapper)
len(awardees['match_name'].dropna()
    )  # approximately 88% of awardees had matched names!

# use matched name to find FWCI in the year of award from the fwci df
search_dict = dict(
    zip(
        awardees.dropna(how='any', subset=['match_name'])['match_name'],
        awardees.dropna(how='any', subset=['match_name'])['Year']))
Example #57
0
def process(source):
    """
    process(collector_input)

    Data process that:

     * Retrieve facet key, values pairs from file or directory attributes

    :param str source: The file full path to process or the dataset ID

    """
    # Get process content from process global env
    assert 'pctx' in globals().keys()
    pctx = globals()['pctx']
    # Block to avoid program stop if a thread fails
    try:
        if pctx.directory or pctx.dataset_id or pctx.dataset_list:
            # Get attributes from directory format or dataset_id format
            attributes = re.match(pctx.pattern, source).groupdict()
        else:
            # Get attributes from NetCDF global attributes
            attributes = dict()
            with ncopen(source) as nc:
                for attr in nc.ncattrs():
                    attributes[attr] = nc.getncattr(attr)
            # Get attributes from filename, overwriting existing ones
            match = re.search(pctx.pattern, source)
            if not match:
                raise ExpressionNotMatch(source, pctx.pattern)
            attributes.update(match.groupdict())
        # Get source values from attributes
        for facet in pctx.facets:
            if facet in pctx.set_keys.keys():
                try:
                    # Rename attribute key
                    attributes[facet] = attributes.pop(pctx.set_keys[facet])
                except KeyError:
                    raise NoNetCDFAttribute(pctx.set_keys[facet], source)
            elif facet in attributes.keys():
                # Facet exists in attribute keys
                pass
            else:
                # Find closest NetCDF attributes in terms of partial string comparison
                key, score = extractOne(facet,
                                        attributes.keys(),
                                        scorer=partial_ratio)
                if score >= 80:
                    # Rename attribute key
                    attributes[facet] = attributes.pop(key)
                    Print.debug(
                        'Consider "{}" attribute instead of "{}" facet'.format(
                            key, facet))
                else:
                    raise NoNetCDFAttribute(pctx.set_keys[facet], source)
            with pctx.lock:
                s = pctx.source_values[0]
                s[facet].add(attributes[facet])
                pctx.source_values[0] = s
        msg = TAGS.SUCCESS + 'Deserialize {}'.format(COLORS.HEADER(source))
        with pctx.lock:
            Print.info(msg)
        return 1
    except KeyboardInterrupt:
        raise
    except Exception:
        exc = traceback.format_exc().splitlines()
        msg = TAGS.FAIL + COLORS.HEADER(source) + '\n'
        msg += '\n'.join(exc)
        with pctx.lock:
            Print.exception(msg, buffer=True)
        return 0
    finally:
        with pctx.lock:
            pctx.progress.value += 1
            percentage = int(pctx.progress.value * 100 / pctx.nbsources)
            msg = COLORS.OKBLUE('\rHarvesting facets values from data: ')
            msg += '{}% | {}/{} {}'.format(percentage, pctx.progress.value,
                                           pctx.nbsources,
                                           SOURCE_TYPE[pctx.source_type])
            Print.progress(msg)
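The facet fallback above keeps a fuzzily matched attribute only when partial_ratio scores at least 80; a small sketch with hypothetical NetCDF attribute names illustrates the idea:

from fuzzywuzzy.fuzz import partial_ratio
from fuzzywuzzy.process import extractOne

# Hypothetical global attributes read from a NetCDF header.
attributes = {'model_id': 'CNRM-CM5', 'experiment_id': 'historical', 'frequency': 'mon'}

# Looking for the 'experiment' facet: the closest attribute key is used
# only if it reaches the 80% partial-ratio threshold.
key, score = extractOne('experiment', attributes.keys(), scorer=partial_ratio)
if score >= 80:
    print('use attribute', key)  # 'experiment_id' in this made-up example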
Example #58
0
def find_state_code(row):
    if row['state'] != 0:
        print(process.extractOne(row['state'], states, score_cutoff=80))
Example #59
0
def _get_best_fuzzy(text, sentences):
    from fuzzywuzzy import process

    return process.extractOne(text, sentences)
Example #60
0
    def fix_selections(self, selections):
        runners = selections.values()
        self.home, _ = process.extractOne(self.home, runners)
        self.away, _ = process.extractOne(self.away, runners)
        self.update_price_selections(runners)