Example No. 1
 def _get_best_match(self, company_name, df):
     similar = difflib.get_close_matches(company_name, 
                                         [name for name in df.company_name])
     if len(similar) and process.extract(company_name, similar)[0][1] > 80:
         zoominfo_profile_name = process.extract(company_name, similar)[0][0]
         for i, zoominfo_profile in df.iterrows():
             if zoominfo_profile['company_name'] == zoominfo_profile_name:
                 return zoominfo_profile.to_dict()
     return "not found"
def findSongsInMyCollection(reference_file, matched_store, toCheck):
    '''
        reference_file : the good-file prepared by dump_mp3_songs_info()
        matched_store  : json file having already mapped results
        toCheck        : json containing list of [{'TIT2','TALB'}] dicts

        returns (found/notfound) containing ({'TIT2','TALB'},path)
                        path is a single result in found
                             and a list(empty too) in not-found
    '''
    with open(matched_store,'r') as fd:
        matched_store_info = json.load(fd)
    reference_info = load_good_file_info(reference_file)
    all_reference_titles = reference_info['TIT2'].keys()
    all_reference_albums = reference_info['TALB'].keys()
    found = []
    notfound = []
    count = 0
    for track in toCheck:
        count += 1
        title = track['TIT2']
        album = track['TALB']
        if album in matched_store_info:
            if title in matched_store_info[album]:
                print ("{}. {}, {} is already matched".format(count, title, album))
                continue
        title_results = process.extract(title, all_reference_titles)
        albums_results = process.extract(album, all_reference_albums)

        title_results = [ i[0] for i in title_results if i[1] > 80 ]
        albums_results = [ i[0] for i in albums_results if i[1] > 80 ]

        all_paths = collections.defaultdict(int)
        for i in title_results:
            for path in reference_info['TIT2'][i]:
                all_paths[path] += 1
        for i in albums_results:
            for path in reference_info['TALB'][i]:
                all_paths[path] += 1

        if not all_paths:
            notfound.append((track, []))
            continue

        max_occurrence = max(all_paths.values())
        max_keys = [ k for k in all_paths if all_paths[k] == max_occurrence ]

        if len(max_keys) == 1:
            print("{}. {},{} matched to {}".format(count, title, album, max_keys[0]))
            found.append((track,max_keys[0]))
        else:
            print("{}. {},{} didn't to {}".format(count, title, album, max_keys[0]))
            notfound.append((track, max_keys))
    return found,notfound
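A minimal usage sketch, assuming hypothetical file names and the load_good_file_info() helper defined elsewhere in the same project:

    # Hypothetical inputs; the JSON layouts follow the docstring above.
    to_check = [{'TIT2': 'Some Title', 'TALB': 'Some Album'}]
    found, notfound = findSongsInMyCollection('good_file.json', 'matched_store.json', to_check)
    for track, path in found:
        print("{} -> {}".format(track['TIT2'], path))
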
def extract_matches(site_description_only, site_name_only2):
    choices1 = site_description_only
    choices2 = site_name_only2

    #change the string to change the search criteria
    extractstring = "Javorski"
    print choices1
    print choices2
    print
    print

    fuzz_items1 = process.extract(extractstring, choices1, limit=2)
    fuzz_items2 = process.extract(extractstring, choices2, limit=1)
    
    print fuzz_items1[0]
    print fuzz_items2[0]
Example No. 4
def nameVars(dat, name=None, numMatch=25):
    '''
    Get variants on a name by Levenshtein distance.
    dat = name data set
    name = name string
    numMatch = number of variants to return
    '''
    name = name[0].upper() + name[1:len(name)].lower()    
    names = dat.Name.unique()
    ndf = process.extract(name, names, limit = numMatch)
    ndf = pd.DataFrame(data = ndf, columns = ['Name', 'Sim'])
    
    nsum = dat.groupby(['Name','Gender'])['Count'].sum()
    
    dat = dat.sort_values(['Name', 'Count'], ascending = [1, 0])
    dat = dat.groupby(['Name','Gender']).first()
    dat.drop('Id', axis=1, inplace=True)
    dat.columns = ['max_year', 'max_year_count']
    
    dat['total_count'] = nsum
    dat.reset_index(inplace=True)

    ndf = pd.merge(ndf, dat, on = 'Name', how= 'inner')

    return ndf 
Example No. 5
    def get(self, input_statement, statement_list, current_conversation=None):
        """
        Takes a statement string and a list of statement strings.
        Returns the closest matching statement from the list.
        """

        # Check if the list is empty
        if not statement_list:
            raise EmptyDatasetException

        # Get the text of each statement
        text_of_all_statements = []
        for statement in statement_list:
            text_of_all_statements.append(statement.text)

        # Check if an exact match exists
        if input_statement.text in text_of_all_statements:
            return input_statement

        # Get the closest matching statement from the database
        closest_match = process.extract(
            input_statement.text,
            text_of_all_statements,
            limit=1
        )[0][0]

        return next((s for s in statement_list if s.text == closest_match), None)
Example No. 6
    def fuzzy(self, tag, threshold=80):
        """Get a tuple of existing tags that fuzzily match a given one.

        Parameters
        ----------
        tag : str or list
            Tag or tags to get fuzzy matches for.
        threshold : int
            Lowest match score to return. Setting to 0 will return every tag,
            while setting to 100 will return only exact matches.

        Returns
        -------
        matches : tuple
            Tuple of tags that match.
        """
        if isinstance(tag, string_types):
            tags = [tag]
        else:
            tags = tag

        matches = []

        for tag in tags:
            matches += [i[0] for i in process.extract(tag, self, limit=None)
                        if i[1] > threshold]

        return tuple(matches)
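A brief usage sketch, assuming the containing class is itself iterable over its tag strings (the TagCollection name and data below are hypothetical):

    tags = TagCollection(['python', 'pythons', 'jython', 'ruby'])
    tags.fuzzy('python')             # e.g. ('python', 'pythons'), depending on scores
    tags.fuzzy(['python', 'ruby'])   # a list of tags is accepted as well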
Example No. 7
    def get(self, input_statement, statement_list=None):
        """
        Takes a statement string and a list of statement strings.
        Returns the closest matching statement from the list.
        """
        statement_list = self.get_available_statements(statement_list)

        if not statement_list:
            if self.has_storage_context:
                # Use a randomly picked statement
                return 0, self.context.storage.get_random()
            else:
                raise EmptyDatasetException

        # Get the text of each statement
        text_of_all_statements = []
        for statement in statement_list:
            text_of_all_statements.append(statement.text)

        # Check if an exact match exists
        if input_statement.text in text_of_all_statements:
            return 1, input_statement

        # Get the closest matching statement from the database
        closest_match, confidence = process.extract(
            input_statement.text,
            text_of_all_statements,
            limit=1
        )[0]

        return confidence, next(
            (s for s in statement_list if s.text == closest_match), None
        )
Example No. 8
def fuzzy_dropbox(project):
    directory_listing = os.listdir(dropbox_path)
    dir_temp = process.extract(project,directory_listing,limit=1)
    dropbox_output_directory = dir_temp[0]
    dropbox_output_directory = dropbox_output_directory[0]
    print 'This is the Dropbox directory name for that post share: ' + dropbox_output_directory
    return dropbox_output_directory
Example No. 9
def fuzzy_project(project):
    directory_listing = os.listdir(server_directory)
    dir_temp = process.extract(project,directory_listing,limit=1)
    server_directory_exact = dir_temp[0]
    server_directory_exact = server_directory_exact[0]
    print 'This is the server directory name for that project: ' + server_directory_exact
    return server_directory_exact
Example No. 10
    def speech(self, message):
        responses = {
            "Hello": ["Hi there!", "Hi!", "Welcome!", "Hello, {name}!"],
            "Hi there": ["Hello!", "Hello, {name}!", "Hi!", "Welcome!"],
            "Hi!": ["Hi there!", "Hello, {name}!", "Welcome!", "Hello!"],
            "Welcome": ["Hi there!", "Hi!", "Hello!", "Hello, {name}!", ],
            "How are you?": ["I'm fine!", "Status: Working...",
                             "I'm doing great."],
            "Good bye": ["Bye, {name}!"],
            "What time is it?": ["Adventure Time!", "{date} UTC"],
        }

        stickers = {
            "adventure_time": "BQADAgADeAcAAlOx9wOjY2jpAAHq9DUC",
        }
        leven = process.extract(message.get("text", ""),
                                responses.keys(), limit=1)[0]

        if leven[1] < 75:
            self.send(text="I can not understand you")
        else:
            response = choice(responses[leven[0]]).format(
                name=message['chat'].get("first_name", ""),
                date=time.ctime(int(message.get("date")))
            )

            if response == "Adventure Time!":
                self.send(sticker=stickers['adventure_time'])
            else:
                self.send(text=response)
Example No. 11
	def findEligibleStops(self,searchTerm):
		possibleStops = []
		# Number matching
		if len(searchTerm) <= 4 and searchTerm.isdigit():
			stopID = "0" * (4 - len(searchTerm)) + str(searchTerm)

			if stopID in self.application.stopIDs:
				shortAddress = self.application.stopIDs[stopID]["Description"]
				possibleStops.append({"value":shortAddress,"id": stopID})
				print("number exists")
			else:
				print("Number doesnt exist")
		else:
			# Fuzzy matching
			fuzzySearchResults = process.extract(searchTerm, self.application.humanAddressList,limit=7)
			for humanAddress in fuzzySearchResults:
				stopObject = self.application.humanAddressDictionary[humanAddress[0]] # Takes the first element (second is match value)
				stopID = stopObject["Stop"]
				shortAddress = self.application.stopIDs[stopID]["Description"]
				possibleStops.append({"value":shortAddress,"id": stopID})

		if len(possibleStops) == 0:
			possibleStops.append({"value":"No stops found","id": "0000"})

		return possibleStops
def main():
	#get file names as inputs here later
	file = xlrd.open_workbook("WSnotof.xls")
	sheet = file.sheets()[0]
	WSnotof = sheet.col_values(2, 1)

	file = xlrd.open_workbook("Catalog.xlsx")
	sheet = file.sheets()[0]
	WScatalog = sheet.col_values(2, 1)

	new_dict = OrderedDict()
	#new_dict = dict()
	for ws in WSnotof:
		if not ws in new_dict:
			matchList = process.extract(ws, WScatalog, limit=3)
			new_dict[ws] = matchList

	file = Workbook()
	sheet = file.add_sheet('Map')
	
	row = 0
	for short_WS, matchList in new_dict.items():
		col = 1
		sheet.write(row, 0, short_WS)
		for match in matchList:
			sheet.write(row, col, match[0])			
			col = col + 1
			sheet.write(row, col, str(match[1]))
			col = col + 1
		row = row + 1 
	file.save('final_choices.xls')
	
	print("Finished!")
Example No. 13
def get_filter_link(link_choice,goal=None,min_score=None,max_limit=4,type=0):
    """
    Get the relevant link(s) from a list of links.
    """
    if min_score:
        min_score = int(min_score)
    else:
        min_score = 60
    scored_link_list = []
    scored_link_list_raw = process.extract(goal,link_choice,limit=max_limit)
    logger.info("Score details for goal {0} with statistics {1}. minimum score {2}".format(goal,scored_link_list_raw,min_score))
    try:
        if scored_link_list_raw:
            for i in list(scored_link_list_raw):
                link = i[0]
                if int(type) != 1:
                    score = i[1]
                    if int(score) >= min_score:
                        scored_link_list.append(link)
                    logger.info("PARTIAL MATCH : Final score is {0} of url {1}  for goal {2}".format(score,link,goal))
                else:
                    score = fuzz.token_set_ratio(goal,link)
                    logger.info("EXACT MATCH : Final score is {0} of url {1}  for goal {2}".format(score,link,goal))
                    if int(score) >= min_score:
                        scored_link_list.append(link)
    except:
        logger.exception("Error occure in get_filter_link() function")
    return scored_link_list
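A usage sketch with hypothetical URLs; it assumes the module-level logger and the fuzzywuzzy imports used above:

    links = ['https://example.com/pricing', 'https://example.com/about', 'https://example.com/contact']
    # Returns the links whose fuzzy score against the goal clears min_score.
    get_filter_link(links, goal='pricing page', min_score=50)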
Example No. 14
    def landmark_check(self):
        tmp = self.item.strip()
        # Name standardization:
        tmp_list = re.sub('[' + string.punctuation + ']', '', tmp).split()
        std = StandardName(tmp_list, False).output
        # Don't match on 'the' if first word
        try:
            tmp = ' '.join(std[1:]) if std[0].upper() in ('THE', 'TEH') else ' '.join(std)
        except:
            tmp = tmp.upper()
        # Fuzzy matching:
        try:
            first_letter = tmp[0]
        except:
            first_letter = ''

        landmark_dict = self.list_landmarks(first_letter)
        landmark_list = [x for x in landmark_dict.keys()]
        results = process.extract(tmp, landmark_list, limit=3)
        results = sorted(results, key=lambda r: r[1], reverse=True)
        try:
            results = [] if results[0][1] == results[1][1] else results
            lname = results[0][0]
            landmark_addresses = landmark_dict[lname]
            # Currently only handle uniquely named landmarks
            # landmark_address = landmark_addresses[0] if results[0][1] > 89 and len(landmark_addresses) == 1 else ''
            landmark_address = landmark_addresses[0] if results[0][1] > 89 else ''
            self.is_landmark = True if landmark_address else False
            self.landmark_address = landmark_address
            self.landmark_name = lname
        except:
            pass
Example No. 15
def results():
	if request.method == 'POST':
		qry = request.json["query"]
		print qry
		dbresults = []
		dbresults = filestable.query.filter_by(ownerhostel = current_user.hostel).search(unicode(qry)).all()
		i = 0
		print 'HELLO ' + str(dbresults)

		if len(dbresults) < 15:
			print "Inside"
			much = 15 - len(dbresults)
			dbresults  += filestable.query.search(unicode(qry)).limit(much).all()

		print len(dbresults)
		if len(dbresults) > 0:
			print 'dbresults is : ' + str(type(dbresults))
			print 'Type of dbresults[0] is : ' + str(type(dbresults[0]))
			dbresultsname = []
			print 'AND NOW ' + str(dbresults[0].ownerhostel)
			for i in xrange(0, len(dbresults)):
				print str(type(dbresults[i]))
				dbresultsname.append(str(dbresults[i].name.replace("_"," ")))
				i = i + 1

			fuzzyResults = process.extract(unicode(qry),dbresultsname,limit=5)
			print 'AND HI ' + str(fuzzyResults)
		else:
			fuzzyResults = ""
			print "Sorry No results"
		return jsonify(result = fuzzyResults)
	else:
		return redirect(url_for('search'))
Example No. 16
def search(s_term='', s_users=False, s_groups=False, s_lists=False):
    # Get all user, group and list objects

    # Merge all user, group and list objects into one list
    all_search_obj = []
    if s_users:
        all_users = object_manager.allUsers()
        all_search_obj.extend(all_users)
    if s_groups:
        all_groups = object_manager.allPublicGroups()
        all_search_obj.extend(all_groups)
    if s_lists:
        all_lists = object_manager.getAllExerciseLists()
        all_search_obj.extend(all_lists)

    # Make a dict of the object with its seachString
    all_search = {obj: obj.searchString() for obj in all_search_obj}

    if s_term:
        # Fuzzy search
        results = process.extract(s_term, all_search, limit=10)

        # Search results have to have at least an 80% match
        filtered = [r[2] for r in reversed(sorted(results, key=lambda e: e[1])) if r[1] >= 80]

        return filtered

    else:
        return sorted(all_search_obj, key=lambda e: e.name())
Example No. 17
def fuzzy_post_share(project,server_path):
    directory_listing = os.listdir(server_path)
    dir_temp = process.extract(project,directory_listing,limit=1)
    post_share_exact = dir_temp[0]
    post_share_exact = post_share_exact[0]
    print 'This is the server post share directory name for that project: ' + post_share_exact
    return post_share_exact
Example No. 18
def fuzzy(project):
    direct_list = os.listdir(dropbox_path)
    dir_temp = process.extract(project,direct_list,limit=1)
    directory = dir_temp[0]
    directory = directory[0]
    print directory
    return directory
Example No. 19
def matchHeard(heard, results, lookingFor='label'):
  located = None

  heard_minus_the = remove_the(heard)
  print 'Trying to match: ' + heard
  sys.stdout.flush()
  heard_list = set([x for x in heard.split() if x not in STOPWORDS])

  for result in results:
    # Strip out non-ascii symbols and lowercase it
    ascii_name = result[lookingFor].encode('ascii', 'replace')
    result_name = str(ascii_name).lower().translate(None, string.punctuation)

    # Direct comparison
    if heard == result_name:
      print 'Simple match on direct comparison'
      located = result
      break

    # Remove 'the'
    if remove_the(result_name) == heard_minus_the:
      print 'Simple match minus "the"'
      located = result
      break

    # Remove parentheses
    removed_paren = re.sub(r'\([^)]*\)', '', ascii_name).rstrip().lower().translate(None, string.punctuation)
    if heard == removed_paren:
      print 'Simple match minus parentheses'
      located = result
      break

  if not located:
    print 'Simple match failed, trying fuzzy match...'
    sys.stdout.flush()
    fuzzy_result = process.extract(str(heard), [d[lookingFor] for d in results], limit=1, scorer=fuzz.QRatio)
    if fuzzy_result[0][1] > 75:
      print 'Fuzzy match %s%%' % (fuzzy_result[0][1])
      located = (item for item in results if item[lookingFor] == fuzzy_result[0][0]).next()
    else:
      heard = replaceDigits(heard)
      fuzzy_result = process.extract(str(heard), [d[lookingFor] for d in results], limit=1, scorer=fuzz.QRatio)
      if fuzzy_result[0][1] > 75:
        print 'Fuzzy match %s%%' % (fuzzy_result[0][1])
        located = (item for item in results if item[lookingFor] == fuzzy_result[0][0]).next()

  return located
Example No. 20
def give(user, amount, reason):
    app = _application_init()
    fullname = lambda u: " ".join([u[x] for x in ('first_name', 'last_name')])
    found = process.extract(user, app.settings.user_cache, processor=fullname)
    top= found[0][0]
    email = top['email']
    fullname = "%s %s" % (top['first_name'], top['last_name'])
    click.echo("Giving %s %s, is this correct?" % (fullname, amount))
Example No. 21
 def busca(self, query):
     query = normaliza_string(query)
     amostra = self.cria_amostra()
     scorer = seleciona_scorer(query)
     resultado = process.extractBests(query, amostra, limit=10, scorer=scorer, score_cutoff=65)
     if scorer == fuzz.token_set_ratio:
         resultado = process.extract(query, lapida_extracao(resultado), limit=20, scorer=fuzz.partial_ratio)
     return lapida_extracao(resultado)
Example No. 22
 def closest_locstrings(self, query, threshold=90, max_results=5):
     # list everything matching location
     with open(self.locstrings_filename) as csvfile:
         r = csv.reader(csvfile)
         result = process.extract(query, r,
                                  processor=lambda x: x[0],
                                  limit=max_results)
         return [(x[0][1], x[1]) for x in result if x[1] > threshold]
Example No. 23
    def match_functions_argument_name(self):
        """ This function populates self.matched_functions_name with tuples of (stub_functions, match_ratio)
         ordered by match_ratio

        :return: None
        """
        self.matched_functions_name = \
            process.extract(self.task_comment, self.functions.keys(), limit=len(self.functions.keys()))
 def work_on_entry(final, toLook, refList, refMainDict):
     if toLook:
         results = process.extract(toLook,refList)
         results = [ i[0] for i in results if i[1] > 70 ]
         for i in results:
             for path in refMainDict[i]:
                 final[path] += 1
     return None
def main():
	switch = 1

	while(switch == 1):
		usercommand = input()
		choices = ["wash dishes", "making bed", "taking out trash", "vacuuming", "cooking food", "doing laundry", "dusting", "lawn mowing"]
		print(process.extract(usercommand, choices, limit=8))
		results = process.extract(usercommand, choices, limit=8)
		match = results[0]
		print("\n")
		print("Your input matches this command:")
		print(match[0])
		print("\n" + "Exit program? (y / n)")
		userexit = input()
		if userexit == "y":
			switch = 0

	return 
Example No. 26
def get_closest_possible_matches(input_arg):
    VALID_MATCH_PERCENTAGE = 60
    MATCH_PERCENTAGE_INDEX = 1
    MATCH_FUNC_INDEX = 0
    available_matches = dispatch_func.keys()
    possible_matches = process.extract(input_arg, available_matches)

    indexed_matches_dict = {index + 1: entry[MATCH_FUNC_INDEX] for index, entry in enumerate(possible_matches) if entry[MATCH_PERCENTAGE_INDEX] > VALID_MATCH_PERCENTAGE}
    return indexed_matches_dict
Example No. 27
 def items(self, name):
     """
     Return fuzzy matches for name across the keys of self._items.
     """
     data = process.extract(
         name,
         self._items.keys()
     )
     return data
Example No. 28
 def find_movie(self, query):
     """
     Find the movie in the library that best matches the given query.
     :param query: Query to search for in the library
     :return: The name of the movie whose title best matches the given query
     :rtype: str
     """
     results = process.extract(query, self.list_movies(), limit=1)
     return results[0][0]
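A one-line usage sketch, assuming a library object whose list_movies() returns stored titles (the titles here are hypothetical):

    library.find_movie('the dark nite')  # -> 'The Dark Knight', the closest stored title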
Example No. 29
def get_instructors(search_string):
    instructor_names = [instructor.name for instructor in Instructor.query.all()]
    instructors = [
        instructor for instructor in process.extract(search_string, instructor_names, limit=100) if instructor[1] > 60
    ]
    instructor_data = [
        get_less_instructor_json(Instructor.query.filter_by(name=instructor[0]).first()) for instructor in instructors
    ]
    return json_response({"status": "success", "data": instructor_data}, 200)
def username_lookup(username):
    if not wf.cached_data_fresh('usernameList', max_age=UPDATE_INTERVAL) or wf.cached_data('hostIdList') is None:
        main.update_caches()
    match_found = localCache.get(str(username))
    if match_found < 1:
        results = process.extract(username, localCache.keys(), limit=3)
        fuzzy_match(results, username)
    else:
        exact_match(match_found, username)
Example No. 31
 #print(delimitedlocation[authors][location])
 countrytext = delimitedlocation[authors][location]
 if countrytext == 'NA':
     countrydata.append('NA')
     citystatedata.append('NA')
     break
 if '(United States)' in countrytext:
     country = 'United States'
     citystate = countrytext.replace('(United States)', '')
     citystate = citystate.replace(',', '')
 elif '(France)' in countrytext:
     country = 'France'
     citystate = countrytext.replace('(France)', '')
     citystate = citystate.replace(',', '')
 else:
     country1, score, index = process.extract(
         delimitedlocation[authors][location], countrylist, limit=1)[0]
     if country1 in delimitedlocation[authors][location]:
         country = country1
     else:
         country = 'No Country'
     citystate = countrytext.replace(country1, '')
     countrycap = country1.upper()
     citystate = citystate.replace(countrycap, '')
     citystate = citystate.replace(',', '')
     citystate = citystate.replace('-', '')
     citystate = citystate.lower()
     citystate = citystate.title()
 #data = process.extract(delimitedlocation[authors][location], combinedcitystatelist, limit=5)
 #citystates = []
 #for i in range(5):
 #    citystates.append(data[i][0])
Example No. 32
ROM    2
"""

fileName = "stringMatch"
df = pd.read_csv("./data/" + fileName + ".txt", sep='\t', header=0)
df["strlen"] = df["str"].str.len()
df = df[df["strlen"] > 1]
dftopList = df[
    df["counts"] >
    4]  # If you don't have a count of the words then you can comment this line
dftopListWords = dftopList["str"].tolist()
words = df["str"].tolist()
i = 0
matchAnalysis = []
for word in dftopListWords:
    for matchWord, matchval in process.extract(word, words, limit=20):
        if word != matchWord:
            levnDist = jellyfish.levenshtein_distance(word, matchWord)
            seqMatch = round(
                SequenceMatcher(None, word, matchWord).ratio() * 100)
            # fuzz.token_sort_ratio
            matchAnalysis.append(
                [word, matchWord, matchval, levnDist, seqMatch,
                 len(word)])

    # if i == 2:
    #     break
    # else:
    #     i += 1
else:
    dfMatch = pd.DataFrame.from_records(matchAnalysis,
Example No. 33
import os

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "geonode.settings")

# start of fuzzy look up
from geodb.models import AfgPplp, Glofasintegrated
from fuzzywuzzy import process
import datetime

# f = AfgPplp.objects.all().filter(dist_na_en='Qala-e-Naw').values('name_en','dist_na_en','prov_na_en')
f = AfgPplp.objects.all().values('name_en', 'dist_na_en', 'prov_na_en')

choices = []
for i in f:
    choices.append(i['name_en'].lstrip() + ';' + i['dist_na_en'] + ';' +
                   i['prov_na_en'])

x = process.extract("BAGHBAN HA;Qala-I- Naw;Badghis", choices)

t = 1
for i in x:
    print t, i
    t = t + 1

# end of fuzzy look up
Example No. 34
def fetch_youtube_results(searchterm):
    # searchterm = request.args.get('searchterm')
    matches = [a for a, b in process.extract(searchterm, yt['title'].tolist())]
    results = yt.loc[yt['title'].isin(matches)]
    return results.reset_index(drop=True).T.to_dict()
Example No. 35
def get_matches(query, choices, limit = 1):
  results = process.extract(query, choices, limit=limit)
  return results
Example No. 36
    def _email_search(self, email, api_key=""):
        try:
            person = clearbit.Person.find(email=email, stream=True)
        except:
            person = None
        data = {
            "pattern": None,
            "name": None,
            "email": email,
            "domain": email.split("@")[-1],
            "crawl_source": "email_hunter"
        }
        if person:
            pattern = EmailGuessHelper()._find_email_pattern(
                person["name"]["fullName"], email)
            if pattern:
                data = {
                    "pattern": pattern,
                    "name": person["name"]["fullName"],
                    "email": email,
                    "domain": email.split("@")[-1],
                    "crawl_source": "email_hunter"
                }
        elif not person or not pattern:
            person = FullContact()._person_from_email(email)
            print person
            try:
                person = person["contactInfo"]["fullName"]
                fullcontact_person = True
            except:
                fullcontact_person = False

            if fullcontact_person:
                person = person["contactInfo"]["fullName"]
                pattern = EmailGuessHelper()._find_email_pattern(person, email)
                data = {
                    "pattern": pattern,
                    "name": person,
                    "email": email,
                    "domain": email.split("@")[-1],
                    "crawl_source": "email_hunter"
                }
                print pattern
            else:
                _email = email.replace(".",
                                       " ").replace("-",
                                                    " ").replace("_", " ")
                _email = _email.replace("@", " ")
                g = Google().search("{0} site:linkedin.com/pub".format(_email))
                g1 = Google().search("{0} site:linkedin.com/pub".format(
                    _email.split(" "[0])))
                g2 = Google().search(
                    "{0} site:linkedin.com/pub".format(_email).split(" ")[-1])
                g = pd.concat([g, g1, g2])
                choices = [i.split(" |")[0] for i in g.link_text]
                person = process.extract(_email, choices, limit=1)
                try:
                    person = person[0][0]
                except:
                    ''' '''
                pattern = EmailGuessHelper()._find_email_pattern(person, email)
                print "google search pattern", pattern
                if pattern:
                    data = {
                        "pattern": pattern,
                        "name": person,
                        "email": email,
                        "domain": email.split("@")[-1],
                        "crawl_source": "email_hunter"
                    }
                else:
                    data = {
                        "pattern": None,
                        "name": None,
                        "email": email,
                        "domain": email.split("@")[-1],
                        "crawl_source": "email_hunter"
                    }
        #data = pd.DataFrame([data])
        conn = r.connect(host="localhost", port=28015, db="triggeriq")
        r.table('email_pattern_crawls').insert(data).run(conn)
        #CompanyEmailPatternCrawl()._persist(data, "emailhunter", api_key)
        # persist to rethinkdb
        print "person", person
Example No. 37
def movie_bot_final(title):
    form = SearchForm(request.form)
    # IDENTIFY THE TITLE THAT WAS PASSED IN
    titleloc = movies.loc[movies['tmdbId'] == int(title)]
    movieTitle = titleloc['title'].iloc[0]
    # GET THE DESCRIPTION OF THE MOVIE THAT WAS PASSED IN
    tmdb_desc = requests.get(f'https://api.themoviedb.org/3/movie/{title}?api_key={api_key}')
    desc_data = tmdb_desc.json()
    if desc_data.get("overview") != None:
        description = desc_data['overview']
    else:
        pass
    # GET THE YOUTUBE TRAILER LINK FOR THE ID THAT WAS PASSED IN
    tmdb_trailer = requests.get(f'https://api.themoviedb.org/3/movie/{title}/videos?api_key={api_key}')
    trailer_response = tmdb_trailer.json()['results']
    if not trailer_response:
        trailer_url = 'None'
    else:
        trailer_data = trailer_response[0].get('key')
        trailer_path = trailer_data
        trailer_url = (f'https://www.youtube.com/watch?v={trailer_path}')
    # FORM SUBMISSION
    if request.method == 'POST':
        form_cont = form.autocomp.data
        str2Match = form_cont
        strOptions = movie_list
        Ratios = process.extract(str2Match,strOptions)
        highest = process.extractOne(str2Match,strOptions)
        fuzzyresult = highest[0]
        movie_index = movies.loc[movies['title'] == fuzzyresult]
        movieID = str(movie_index['tmdbId'].iloc[0])
        # IF THE STRING IS AN EXACT MATCH, THERE IS NO NEED TO GO TO THE SEARCH PAGE. IF INPUT IS NOT GREATER THAN 1, DO NOTHING. 
        if form_cont == fuzzyresult:
            return redirect('../rec/' + movieID)
        elif len(form_cont) > 1:
            return redirect('../results/' + form_cont)
        else:
            pass
    titles = movies['title']
    indices = pd.Series(movies.index, index=movies['title'])
    idx = indices[movieTitle]
    # -----------------------------
    # ML BASED ON THE MOVIE GENRE
    # -----------------------------
    genre_sim_scores = list(enumerate(genre_cosine_sim[idx]))
    genre_sim_scores = sorted(genre_sim_scores, key=lambda x: x[1], reverse=True)
    genre_sim_scores = genre_sim_scores[1:21]
    genre_movie_indices = [i[0] for i in genre_sim_scores]
    # RETURNS THE 12 MOST SIMILAR MOVIES BY GENRE
    genre_df = titles.iloc[genre_movie_indices].head(13).to_frame()
    # ----------------------------
    # ML BASED ON THE MOVIE CAST
    # ----------------------------
    cast_sim_scores = list(enumerate(cast_cosine_sim[idx]))
    cast_sim_scores = sorted(cast_sim_scores, key=lambda x: x[1], reverse=True)
    cast_sim_scores = cast_sim_scores[1:21]
    cast_movie_indices = [i[0] for i in cast_sim_scores]
    # RETURNS THE 12 MOST SIMILAR MOVIES BY CAST
    cast_df = titles.iloc[cast_movie_indices].head(13).to_frame()
    # -----------------------------------
    # ML BASED ON THE MOVIE DESCRIPTION
    # -----------------------------------
    desc_sim_scores = list(enumerate(desc_cosine_sim[idx]))
    desc_sim_scores = sorted(desc_sim_scores, key=lambda x: x[1], reverse=True)
    desc_sim_scores = desc_sim_scores[1:21]
    desc_movie_indices = [i[0] for i in desc_sim_scores]
    # RETURNS THE 12 MOST SIMILAR MOVIES BY DESCRIPTION
    desc_df = titles.iloc[desc_movie_indices].head(13).to_frame()
    # ------------------------------------------------------------------
    # REMOVING SEARCH TITLE FROM RESULTS AND RETURNING 12 RECS
    # ------------------------------------------------------------------
    genre_df = genre_df[genre_df.title != movieTitle]
    genre_df = genre_df.head(12)
    cast_df = cast_df[cast_df.title != movieTitle]
    cast_df = cast_df.head(12)
    desc_df = desc_df[desc_df.title != movieTitle]
    desc_df = desc_df.head(12)
    # ------------------------------------------------------------------
    # PROCESSING RESULTS AND CREATING ONE LARGE DATAFRAME
    # ------------------------------------------------------------------
    mv = pd.concat([genre_df,cast_df,desc_df]).reset_index(drop=True)
    cols = ['title']
    temp_df = mv.join(movies.set_index(cols), on=cols)
    # GETTING MOVIE INFORMATION
    moviename = []
    url1 = []
    movCastin = titleloc['cast'].iloc[0]
    movCastOut = movCastin.replace("'","").strip("][").split(', ')
    topCast = []
    for x in range(3):
        topCast.append(movCastOut[x])
    # PULLS THE IMAGE URL FROM THE MOVIES DF AND APPENDS THEM TO THE URL PREFIX FOR THE MOVIE POSTERS
    # PASSES THE MOVIE POSTER URL INTO THE RECS.HTML PAGE
    titleurl = str("https://image.tmdb.org/t/p/original/" + titleloc['poster_path'].iloc[0])
    backdropPath = str(desc_data['backdrop_path'])
    bgurl = ("https://image.tmdb.org/t/p/original/" + backdropPath)
    runtime = str(desc_data['runtime'])
    for film in temp_df.tmdbId:
        moviename.append(film)
    for poster in temp_df.poster_path:
        url1.append("http://image.tmdb.org/t/p/w185" + str(poster))
    return render_template('recs.html', moviename=moviename, url1=url1, topCast=topCast, movieTitle=movieTitle, titleurl=titleurl, bgurl=bgurl, form=form, description=description, runtime=runtime, trailer_url=trailer_url)
Example No. 38
def golden_source_merge(df_list, key, threshold=80, limit=1):
    """

    """

    # create null match columns

    matching_dict = {}

    df_1 = df_list.pop(0)
    df_1 = df_1[key]  # drop all other columns

    df_1.drop_duplicates(subset=key, inplace=True)  # drop duplicates

    for df_2 in df_list:

        df_2 = df_2[key]  # drop all other columns

        df_1['match_key'] = ''
        df_2['match_key'] = ''
        df_2.drop_duplicates(subset=key, inplace=True)  # drop duplicates

        # combines the list of column inputs into a single string for matching
        for value in key:
            df_1['match_key'] = df_1['match_key'].map(
                str) + ' ' + df_1[value].map(str)

        for value in key:
            df_2['match_key'] = df_2['match_key'].map(
                str) + ' ' + df_2[value].map(str)

        # remove periods for abbreviated names
        df_1['match_key'] = df_1['match_key'].map(lambda x: x.strip(".,!"))
        df_2['match_key'] = df_2['match_key'].map(lambda x: x.strip(".,!"))

        # applies lower case and removes common words like "college" and "the"
        df_1['match_key'] = df_1['match_key'].apply(format_match_string)
        df_2['match_key'] = df_2['match_key'].apply(format_match_string)

        # the match process-creates the match keys to a list, matches, then saves them in the match column
        r = df_1['match_key'].tolist()
        s = df_2['match_key'].tolist()

        m = df_1['match_key'].apply(lambda x: process.extract(
            x, s, limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio))
        df_1['match'] = m


        df_2_matches = df_2['match_key'].apply(lambda x:
                                    process.extract(x, r, limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio))\
                                    .apply(lambda x: [i[1] for i in x if i[1] < threshold])\
                                    .apply(lambda x: 1 if x else 0)  # 0 if empty list

        df_2 = df_2.merge(df_2_matches.rename('not_matched'),
                          left_index=True,
                          right_index=True)

        matching_dict.update(
            return_matching_dictionary(df_2, df_1, key, threshold))

        df_2 = df_2.loc[df_2['not_matched'] == 1]

        # drop the score value and only keep the match words
        m2 = df_1['match'].apply(
            lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
        df_1['match'] = m2

        # merge based on the matches, suffixes to drop the columns later
        temp_df = df_1.merge(df_2,
                             left_on='match',
                             right_on='match_key',
                             suffixes=['', '_y'])

        # add back in df1 values that were dropped

        df_1 = pd.concat([df_1, temp_df]).drop_duplicates(key)

        # add in df2 values that weren't matched

        df_1 = pd.concat([df_1, df_2]).drop_duplicates(key)

        # drop the matching name columns since this is a left join
        df_1 = df_1[df_1.columns.drop(list(df_1.filter(regex='_y')))]
        df_1 = df_1[key]
        df_1.reset_index(drop=True, inplace=True)

    return df_1, matching_dict
Example No. 39
def get_matches(query,choices):
    results = process.extract(query,choices)
    return results
Example No. 40
 def get_best_plugin_name_match(self, plugin_name):
     choices = [p.replace('_', ' ') for p in self.bot.get_plugins_names()]
     plugin_name = plugin_name.replace('_', ' ')
     result = process.extract(plugin_name, choices, scorer=fuzz.token_sort_ratio)
     result = [(r[0].replace(' ', '_'), r[1]) for r in result]
     return result[0][0] if result[0][1] > 65 else None
# Append deduplicated census_b to census_a
full_census = census_a.append(unique_b)

# end timer
end = time.time()

### fuzzywuzzy

# Minimum Edit Distance (MED) is the smallest number of steps needed to transform
# one string into another, using only four operations: insertion, deletion,
# substitution, and transposition of consecutive characters.
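# Illustrative worked example: transforming 'kitten' into 'sitting' takes 3 steps:
#   kitten -> sitten  (substitute 'k' with 's')
#   sitten -> sittin  (substitute 'e' with 'i')
#   sittin -> sitting (insert 'g')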

from fuzzywuzzy import fuzz

# The output returns a percentage between 0 and 100, 0 being not similar at all and 100 being identical:
fuzz.WRatio('Python', 'Cython')

# there are 4 other functions to compute string similarity:
fuzz.ratio
fuzz.partial_ratio
fuzz.token_sort_ratio
fuzz.token_set_ratio
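
# A rough sketch of how the scorers differ (exact values vary by fuzzywuzzy
# version, so treat the scores as approximate):
fuzz.ratio('Houston Rockets', 'Rockets')                              # ~64, compares the whole strings
fuzz.partial_ratio('Houston Rockets', 'Rockets')                      # 100, best-matching substring
fuzz.token_sort_ratio('Rockets Houston', 'Houston Rockets')           # 100, ignores word order
fuzz.token_set_ratio('Houston Rockets vs Lakers', 'Houston Rockets')  # 100, ignores extra tokens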

# Extract Best Matches to a String from a List of Options

from fuzzywuzzy import process
string_to_match = 'Mercedez-Benz'
options = ['Ford', 'Mustang', 'mersedez benz', 'MAZDA', 'Mercedez']

process.extract(
    query=string_to_match, choices=options,
    limit=3)  # can adjust scoring methods by setting scorer=fuzz.ratio
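# The result is a list of (choice, score) tuples ordered by score, along the lines of
# [('mersedez benz', <score>), ('Mercedez', <score>), ...] -- the exact scores depend
# on the scorer used.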
Example No. 42
from fuzzywuzzy import process

cities = ['Karachi', 'Lahore', 'Islamabad', 'Rawalpindi', 'Quetta', 'Peshawar', 'Gawadar', 'Multan', 'Hyderabad', 'Faisalabad'
          , 'Gujranwala', 'Rahim Yar Khan']

# limit = 3

result = process.extract('hk', cities, limit=3)

print(result)
file_glossary = "glossary.txt"

file_compare=sys.argv[1]

comparison_results = {}

# For now we just do fuzzywuzzy compare and report highest score for each word in file_compare

with open(file_glossary) as filedata_glossary:
    lines_glossary = [line.rstrip('.md\n') for line in filedata_glossary]

with open(file_compare) as filedata_compare:
    lines_compare = [line.rstrip('\n') for line in filedata_compare]

for item_in_compare in lines_compare:
    comparison_results = process.extract(item_in_compare, lines_glossary)
    print(comparison_results)


# Workflow: turn document into a list of words and word count (data.csv)
# Compare to glossary, results.csv with word, count of word in document, then top 3 matches?
# have a cut off percentage?
    
#
# How to handle:
# 1) exact matches (100) of primary file and long term of alias
# 2) really close matches (percentage? additional string compares? do a simple poor man's test as well?) of primary file and long term of alias
# 3) stuff with no match of primary file and long term of alias?
# 4) how to handle the stop list, we'll need to put that data in as well
#
# We'll generate a CSV output file with the file_compare entries and then:
Example No. 44
    names=['c', 'n'],
    index_col='c',
)
## Construct dictionary for country/region names
c_names = df_results.to_dict(
)['n']  #http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
c_names_inv = {v: k for k, v in c_names.items()}

## Country names fuzzy match
from fuzzywuzzy import process

choice = []
for i, c_name_Web in enumerate(list_country_names_Web):

    #found_candidates = [x for x in c_names_inv.keys() if fuzzy_match(x,c_name_Web)==True]
    found_candidate = process.extract(c_name_Web, c_names_inv.keys(), limit=1)
    found_candidate_c = c_names_inv[found_candidate[0][0]]
    choice_item = [i, c_name_Web, found_candidate, found_candidate_c]
    #print (choice_item)
    choice.append(choice_item)

import ast
done = False
while not (done):
    try:
        # Note: Python 2.x users should use raw_input, the equivalent of 3.x's input
        prn = [repr(x) for x in choice]
        print("\n\r".join(prn))
        i = int(input("Please enter your corrections: Serial no (-1:None): "))
        if i == -1:
            print("Done!")
Example No. 45
async def newquotes(ctx, phrase, source):
    titre = "Recherche en cours veuillez patienter"
    msg = await ctx.send(titre)
    if len(source) >= 5000:
        list_s = split_dict_to_multiple(source, round(len(source) / 10))
    else:
        list_s = [source]
    results = {}
    i = 1
    full = "█"
    empty = "░"
    for x in list_s:
        Ratios = process.extract(phrase, list(x))
        print(f"{i}/{len(list_s)}")
        if i % 2 == 0:
            await msg.edit(
                content=f"{titre}\n{full*i*2}{empty*((len(list_s)-i)*2)}")
        for r in Ratios:
            if r[1] < 87:
                break
            results[r[0]] = r[1]
        i += 1
    results = list(
        dict(sorted(results.items(), key=lambda item: item[1], reverse=True)))
    print(results)
    answer = ""
    if len(results) != 1:
        txt = "**__LISTE DES QUOTES TROUVEE__**\n"
        emoji = [
            "1️⃣", "2️⃣", "3️⃣", "4️⃣", "5️⃣", "6️⃣", "7️⃣", "8️⃣", "9️⃣", "🔟"
        ]

        used = {}
        i = 0
        for x in results:
            txt += f"{emoji[i]} {x} -> {source[x]['title']}, à {source[x]['time']}s\n"  # ({dict[x]['ep']}&t={dict[x]['time']})
            used[emoji[i]] = i
            i += 1
        await msg.edit(content=f"{txt}")
        for x in used:
            await msg.add_reaction(x)

        def check(reaction, user):
            return user == ctx.author and str(reaction.emoji) in used

        try:
            reaction, user = await bot.wait_for('reaction_add',
                                                timeout=40,
                                                check=check)
        except asyncio.TimeoutError:
            await msg.edit(content=f"Temps écoulé pour \"{phrase}\"")
            try:
                for x in used:
                    await msg.clear_reaction(x)
            except:
                print("pas les bons role")
            return
        answer = results[used[reaction.emoji]]
    else:
        answer = results[0]
    await msg.edit(content=f"{source[answer]['ep']}&t={source[answer]['time']}"
                   )
    try:
        for x in used:
            await msg.clear_reaction(x)
    except:
        print("pas les bons role")
Example No. 46
 def get_phone_name(self, phone_name):
     return process.extract(phone_name, self.phone_list, limit=1)[0]
Example No. 47
def handleQuery(query) -> list:
    """Hook that is called by albert with *every new keypress*."""  # noqa
    results = []

    if query.isTriggered:
        try:
            # be backwards compatible with v0.2
            if "disableSort" in dir(query):
                query.disableSort()

            results_setup = setup(query)
            if results_setup:
                return results_setup

            query_str = query.string

            # new behavior
            tokens = query_str.split()
            if len(tokens) >= 1 and tokens[0] == "new":
                if len(tokens) > 1:
                    name = tokens[1]
                else:
                    name = ""
                if len(tokens) > 2:
                    desc = " ".join(tokens[2:])
                else:
                    desc = ""

                results.append(
                    v0.Item(
                        id=__prettyname__,
                        icon=icon_path,
                        text=f"New abbreviation: {name}",
                        subtext=f"Description: {desc}",
                        actions=[
                            v0.FuncAction(
                                f"Save abbreviation to file",
                                lambda name=name, desc=desc: save_abbr(
                                    name, desc),
                            )
                        ],
                    ))

                return results

            curr_hash = hash_file(abbreviations_path)
            global abbr_latest_hash, abbr_latest_d, abbr_latest_d_bi
            if abbr_latest_hash != curr_hash:
                abbr_latest_hash = curr_hash
                with open(abbreviations_path) as f:
                    conts = f.readlines()
                    abbr_latest_d = make_latest_dict(conts)
                    abbr_latest_d_bi = abbr_latest_d.copy()
                    abbr_latest_d_bi.update(
                        {v: k
                         for k, v in abbr_latest_d.items()})

            if not abbr_latest_d:
                results.append(
                    v0.Item(
                        id=__prettyname__,
                        icon=icon_path,
                        text=
                        f'No lines split by "{split_at}" in the file provided',
                        actions=[
                            v0.ClipAction(
                                f"Copy provided filename",
                                str(abbreviations_path),
                            )
                        ],
                    ))

                return results

            # do fuzzy search on both the abbreviations and their description
            matched = process.extract(query_str,
                                      abbr_latest_d_bi.keys(),
                                      limit=10)
            for m in [elem[0] for elem in matched]:
                if m in abbr_latest_d.keys():
                    results.append(get_abbr_as_item((m, abbr_latest_d[m])))
                else:
                    results.append(get_abbr_as_item((abbr_latest_d_bi[m], m)))

        except Exception:  # user to report error
            if dev_mode:  # let exceptions fly!
                print(traceback.format_exc())
                raise

            results.insert(
                0,
                v0.Item(
                    id=__prettyname__,
                    icon=icon_path,
                    text=
                    "Something went wrong! Press [ENTER] to copy error and report it",
                    actions=[
                        v0.ClipAction(
                            f"Copy error - report it to {__homepage__[8:]}",
                            f"{traceback.format_exc()}",
                        )
                    ],
                ),
            )

    return results
Example No. 48
def create_entity_node_relationships(df,
                                     entity_name,
                                     global_id_counter,
                                     levenshtein_thresh=None):
    raw_entity_df = get_entity_df(df, entity_name.upper())
    entity_counts = raw_entity_df.name.str.lower().value_counts()
    if "" in entity_counts:
        del entity_counts[""]

    if levenshtein_thresh:
        most_common = entity_counts[:levenshtein_thresh]
        most_common_set = set(most_common.index)

        raw_to_resolved = {}
        resolved_entity_counts = defaultdict(int)

        for name in most_common.index:
            resolved_entity_counts[name] += most_common[name]

        for name in entity_counts.index:
            if name in most_common_set:
                continue
            (candidate_name,
             candidate_score), = process.extract(name,
                                                 most_common_set,
                                                 limit=1,
                                                 scorer=fuzz.ratio)
            if candidate_score > 89:
                raw_to_resolved[name] = candidate_name
                resolved_entity_counts[candidate_name] += entity_counts[name]
            else:
                resolved_entity_counts[name] += entity_counts[name]
    else:
        raw_to_resolved = {}
        resolved_entity_counts = entity_counts

    resolved_entity_count_df = pd.DataFrame(
        {"mentions": resolved_entity_counts})
    resolved_entity_count_df["id"] = [
        str(next(global_id_counter))
        for _ in range(len(resolved_entity_count_df))
    ]
    resolved_entity_count_df["name"] = resolved_entity_count_df.index
    resolved_entity_count_df = resolved_entity_count_df.set_index("id",
                                                                  drop=False)

    entity_df_n4j = pd.DataFrame({
        "entity{entity}Id:ID".format(entity=entity_name.capitalize()):
        resolved_entity_count_df.id,
        "name":
        resolved_entity_count_df.name,
        "mentions:int":
        resolved_entity_count_df.mentions,
        ":LABEL":
        "Entity_{entity}".format(entity=entity_name.capitalize())
    })

    save_node = "neo4j-csv/entity_{entity}.csv".format(entity=entity_name)
    entity_df_n4j.to_csv(save_node, index=False)

    raw_entity_df = raw_entity_df.drop_duplicates()
    raw_entity_df["name_format"] = raw_entity_df.name
    raw_entity_df["name_lower"] = raw_entity_df.name.str.lower()
    raw_entity_df["name"] = raw_entity_df.name.str.lower().apply(
        lambda n: raw_to_resolved[n] if n in raw_to_resolved else n)
    relationship_df = pd.merge(resolved_entity_count_df,
                               raw_entity_df,
                               on='name')

    mentions_n4j = pd.DataFrame({
        ":START_ID": relationship_df.emailId,
        ":END_ID": relationship_df.id,
        "as": relationship_df.name_format,
        ":TYPE": "MENTION"
    })

    save_relationship = "neo4j-csv/mentions_{entity}.csv".format(
        entity=entity_name)
    mentions_n4j.to_csv(save_relationship, index=False)
Example No. 49
def search(list_to_search: list,
           value,
           key,
           cutoff=5,
           return_key=False,
           strict=False):
    """Fuzzy searches a list for an object
    result can be either an object or list of objects
    :param list_to_search: The list to search.
    :param value: The value to search for.
    :param key: A function defining what to search for.
    :param cutoff: The scorer cutoff value for fuzzy searching.
    :param return_key: Whether to return the key of the object that matched or the object itself.
    :param strict: If True, will only search for exact matches.
    :returns: A two-tuple (result, strict)"""
    # there is nothing to search
    if len(list_to_search) == 0:
        return [], False

    # full match, return result
    exact_matches = [
        a for a in list_to_search if value.lower() == key(a).lower()
    ]
    if not (exact_matches or strict):
        partial_matches = [
            a for a in list_to_search if value.lower() in key(a).lower()
        ]
        if len(partial_matches) > 1 or not partial_matches:
            names = [key(d).lower() for d in list_to_search]
            fuzzy_map = {key(d).lower(): d for d in list_to_search}
            fuzzy_results = [
                r for r in process.extract(
                    value.lower(), names, scorer=fuzz.ratio) if r[1] >= cutoff
            ]
            fuzzy_sum = sum(r[1] for r in fuzzy_results)
            fuzzy_matches_and_confidences = [
                (fuzzy_map[r[0]], r[1] / fuzzy_sum) for r in fuzzy_results
            ]

            # display the results in order of confidence
            weighted_results = []
            weighted_results.extend(
                (match, confidence)
                for match, confidence in fuzzy_matches_and_confidences)
            weighted_results.extend((match, len(value) / len(key(match)))
                                    for match in partial_matches)
            sorted_weighted = sorted(weighted_results,
                                     key=lambda e: e[1],
                                     reverse=True)

            # build results list, unique
            results = []
            for r in sorted_weighted:
                if r[0] not in results:
                    results.append(r[0])
        else:
            results = partial_matches
    else:
        results = exact_matches

    if len(results) > 1:
        if return_key:
            return [key(r) for r in results], False
        else:
            return results, False
    elif not results:
        return [], False
    else:
        if return_key:
            return key(results[0]), True
        else:
            return results[0], True
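A minimal usage sketch, assuming a hypothetical list of objects with a name attribute and a key callable that selects it:

    class Spell:
        def __init__(self, name):
            self.name = name

    spells = [Spell('Fireball'), Spell('Fire Shield'), Spell('Ice Storm')]
    result, exact = search(spells, 'fire ball', key=lambda s: s.name, cutoff=50)
    # 'exact' is True only when the search narrowed down to a single result;
    # otherwise 'result' is a list of candidates ordered by confidence.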
Example No. 50
def scrape():
    browser = init_browser()

    receivingURL = 'https://nextgenstats.nfl.com/stats/receiving#yards'
    browser.visit(receivingURL)
    html = browser.html
    soup = bs(html, 'html.parser')

    receivingtable=pd.read_html(str(soup.find_all('table')))
    recdf= pd.DataFrame(receivingtable[1])
    reccolumnlist = receivingtable[0].values.tolist()[0]
    reccolumnlist.pop()
    recdf.columns = reccolumnlist
    recdf = recdf.rename(index=str, columns={"+/-Avg .YAC Above Expectation":"+/-Avg YAC Above Expectation"})
    recdf_dict = recdf.to_dict(orient='records')

    rushingURL = 'https://nextgenstats.nfl.com/stats/rushing#yards'
    browser.visit(rushingURL)
    html = browser.html
    soup = bs(html, 'html.parser')

    rushingtable=pd.read_html(str(soup.find_all('table')))
    rushdf= pd.DataFrame(rushingtable[1])
    rushcolumnlist = rushingtable[0].values.tolist()[0]
    rushcolumnlist.pop()
    rushdf.columns = rushcolumnlist
    rushdf_dict = rushdf.to_dict(orient='records')
    
    passingURL = 'https://nextgenstats.nfl.com/stats/passing#yards'
    browser.visit(passingURL)
    html = browser.html
    soup = bs(html, 'html.parser')

    passingtable=pd.read_html(str(soup.find_all('table')))
    passdf= pd.DataFrame(passingtable[1])
    passcolumnlist = passingtable[0].values.tolist()[0]
    passcolumnlist.pop()
    passdf.columns = passcolumnlist
    passdf_dict = passdf.to_dict(orient='records')

    # List of each week number
    weeks = list(range(1,18))
    # Api Url
    base_url = "http://api.fantasy.nfl.com/v1/players/stats?statType=weekStats&season=2018&week={}&form=json"
    temp_final_df = pd.DataFrame()
    for week in weeks:
        target_url = base_url.format(week)
        temp = requests.get(target_url).json()['players']
        temp_df = pd.DataFrame(temp)
        temp_df = temp_df.drop(columns = 'stats')
        temp_df['week'] = week
        temp_final_df = temp_final_df.append(temp_df)
    team_names = temp_final_df.teamAbbr.unique()

    temp_dict = temp_final_df.to_dict(orient='records')

    data = pd.read_csv(r'C:\\Users\\rirvi\Documents\\NFLETL\\2018_Schedule_City.csv')
    data['away_abrev'] = data['Away'].str[0:4]
    data['home_abrev'] = data['Home'].str[0:4]

    away_abrev = []

    for awy in data['away_abrev']:
        away_abrev.append(process.extract(awy,team_names)[0][0])

    home_abrev = []

    for hme in data['home_abrev']:
        home_abrev.append(process.extract(hme,team_names)[0][0])

    data['away_abrev'] = away_abrev
    data['home_abrev'] = home_abrev
    schedule_dict = data.to_dict(orient='records')

    return passdf_dict, rushdf_dict, recdf_dict, temp_dict, schedule_dict
Example No. 51
def albertson_main(file, pages):
    temp_directory = tempfile.TemporaryDirectory(dir=document_location)
    input_pdf_location = f'{temp_directory.name}/input_pdf.pdf'
    final_dict = {}
    overall_content_list = []
    input_pdf = get_smb_or_local(file, input_pdf_location)
    pages = pages
    pdfplumber_pdf = pdfplumber.open(input_pdf)
    for page in pages.split(','):
        print(f'{page}')
        if int(page) - 1 in range(len(pdfplumber_pdf.pages)):
            page_dict = {}
            tables = camelot.read_pdf(input_pdf,
                                      pages=page,
                                      flavor='stream',
                                      row_tol=12,
                                      edge_tol=500)
            no_of_tables = len(tables)
            chunked_df = {}
            for table_no in range(no_of_tables):
                chunk_index_list = []
                chunk = {}
                df = tables[table_no].df
                rows, columns = df.shape
                for column in range(columns):
                    for row in range(rows):
                        search_query = df[column][row]
                        for title, regex_pattern in title_card_dict.items():
                            if re.search(r"{}".format(regex_pattern),
                                         search_query, re.I):
                                # print(search_query)
                                chunk_index_list.append({
                                    'title': title,
                                    'index': row
                                })
                            else:
                                chunk_index_list.append({
                                    'title': 'NO TITLE',
                                    'index': 0
                                })
                # chunk_index_list = sorted(chunk_index_list,key = lambda x: x['index'])
                chunk_index_list = sorted(list({
                    frozenset(list_element.items()): list_element
                    for list_element in chunk_index_list
                }.values()),
                                          key=lambda d: d['index'])
                for index, title_index_dict in enumerate(chunk_index_list):
                    # print(title_index_dict)
                    try:
                        chunk[title_index_dict['title']] = [
                            title_index_dict['index'],
                            chunk_index_list[index + 1]['index']
                        ]
                    except:
                        chunk[title_index_dict['title']] = [
                            title_index_dict['index']
                        ]
                for title, chunk_list in chunk.items():
                    # print(chunk_list)
                    try:
                        temp_df = df.loc[chunk_list[0]:chunk_list[1] - 1]
                        temp_df = temp_df.reset_index(drop=True)
                        rows, columns = temp_df.shape
                        if columns > 2:
                            out = is_certification_or_nutrition(temp_df)
                            # print('output---------->', out)
                            if out not in ['None']:
                                title = out
                            else:
                                pass
                        chunked_df[title] = temp_df
                    except:
                        temp_df = df.loc[chunk_list[0]:]
                        temp_df = temp_df.reset_index(drop=True)
                        rows, columns = temp_df.shape
                        if columns > 2:
                            out = is_certification_or_nutrition(temp_df)
                            # print('output---------->', out)
                            if out not in ['None']:
                                title = out
                            else:
                                pass
                                # temp_df = temp_df.drop([0],axis='columns')
                                # temp_df.columns = range(temp_df.shape[1])
                        chunked_df[title] = temp_df
            for df_title, dataframe in chunked_df.items():
                if df_title not in [
                        "COMPANY CONTACT INFORMATION", "NUTRITION",
                        "NUTRITION_SERVING", "CERTIFICATIONS"
                ]:
                    rows, columns = dataframe.shape
                    if columns >= 2 and rows > 1:
                        # dataframe preprocessing
                        df_pro = dataframe.applymap(cleaning_unwanted_titles)
                        df_pro = cleaning_unwanted_headers(df_pro)
                        df_pro = noise_removal_1(df_pro)
                        df_pro = df_pro.applymap(cleaning_sub_texts)
                        df_pro = noise_removal_2(df_pro)
                        df_pro = df_pro.applymap(check_nan)
                        df_pro = df_pro.dropna(axis=1, how='all')
                        df_pro = df_pro.dropna(axis=0, how='all')
                        df_pro.columns = range(df_pro.shape[1])
                        df_pro = df_pro.reset_index(drop=True)
                        df_pro = ffill_block_strategy(df_pro)
                        df_pro[0].fillna(method='ffill', axis=0, inplace=True)
                        df_pro = df_pro.applymap(convert_for_bfill_strategy)
                        df_pro[0].fillna(method='bfill', axis=0, inplace=True)
                        df_pro[0].fillna(method='ffill', axis=0, inplace=True)
                        df_pro.fillna('', inplace=True)
                        content_dict, content_list = normal_content_processing(
                            df_pro)
                        overall_content_list.extend(content_list)
                        if page != '1':
                            plumber_content_list = get_overall_content(
                                input_pdf, page)
                            plumber_content_list = list(
                                set(plumber_content_list))
                            unmapped_element = []
                            for plumber_content in plumber_content_list:
                                _, plumb_score = process.extract(
                                    plumber_content.lower(),
                                    overall_content_list,
                                    scorer=fuzz.partial_token_set_ratio)[0]
                                _, plumb_score1 = process.extract(
                                    plumber_content.lower(),
                                    overall_content_list,
                                    scorer=fuzz.ratio)[0]
                                # print(plumber_content,plumb_score)
                                if (plumb_score < 90) or (plumb_score > 90 and
                                                          plumb_score1 < 70):
                                    unmapped_element.append(plumber_content)
                            # unmapped_element = list(set(plumber_content_list)-set(content_list))
                            for content in unmapped_element:
                                output = base(
                                    'general',
                                    model_location).prediction(content)
                                print(output)
                                if output['output'] in ['ingredients']:
                                    if 'INGREDIENTS_DECLARATION' in content_dict:
                                        content_dict[
                                            'INGREDIENTS_DECLARATION'].append(
                                                {'en': content})
                                    else:
                                        content_dict[
                                            'INGREDIENTS_DECLARATION'] = [{
                                                'en':
                                                content
                                            }]
                                else:
                                    if 'unmapped' in content_dict:
                                        content_dict['unmapped'].append(
                                            {'en': content})
                                    else:
                                        content_dict['unmapped'] = [{
                                            'en':
                                            content
                                        }]
                            # print('****' * 5)
                            # print('unmapped element----->',unmapped_element)
                            # print('content list----->',content_list)
                            # print('plumber content list-------->',plumber_content_list)
                            # print('*******' * 6)

                        page_dict = {**page_dict, **content_dict}
                    else:
                        pass
                else:
                    rows, columns = dataframe.shape
                    if columns >= 2 and rows >= 1:
                        # if df_title == "NUTRITION":
                        if "NUTRITION" in df_title:
                            nutrition_data = nutrition_processing(dataframe)
                            try:
                                if 'serving size' in page_dict:
                                    page_dict['serving size'].append({
                                        'en':
                                        nutrition_data['serving size'][0]
                                    })
                                else:
                                    page_dict['serving size'] = [{
                                        'en':
                                        nutrition_data['serving size'][0]
                                    }]
                                nutrition_data.pop('serving size', None)
                            except:
                                pass
                            try:
                                if 'varied' in page_dict:
                                    page_dict['varied'].append(
                                        {'en': nutrition_data['varied'][0]})
                                else:
                                    page_dict['varied'] = [{
                                        'en':
                                        nutrition_data['varied'][0]
                                    }]
                                nutrition_data.pop('varied', None)
                            except:
                                pass
                            if nutrition_data:
                                if 'NUTRITION_FACTS' in page_dict:
                                    page_dict['NUTRITION_FACTS'].append(
                                        nutrition_data)
                                else:
                                    page_dict['NUTRITION_FACTS'] = [
                                        nutrition_data
                                    ]
                            # print(nutrition_data)
                        elif df_title == "CERTIFICATIONS":
                            # print('inside certification')
                            df_pro = dataframe.applymap(
                                cleaning_unwanted_titles)
                            df_pro = cleaning_unwanted_headers(df_pro)
                            df_pro = noise_removal_1(df_pro)
                            df_pro = df_pro.applymap(cleaning_sub_texts)
                            df_pro = noise_removal_2(df_pro)
                            df_pro = df_pro.applymap(check_nan)
                            df_pro = df_pro.dropna(axis=1, how='all')
                            df_pro = df_pro.dropna(axis=0, how='all')
                            df_pro.columns = range(df_pro.shape[1])
                            df_pro = df_pro.reset_index(drop=True)
                            df_pro[0].fillna(method='ffill',
                                             axis=0,
                                             inplace=True)
                            df_pro[0].fillna(method='bfill',
                                             axis=0,
                                             inplace=True)
                            df_pro.fillna('', inplace=True)
                            certification_data = certifications_processing(
                                df_pro)
                            page_dict = {**page_dict, **certification_data}
            final_dict[page] = page_dict
    try:
        temp_directory.cleanup()
    except:
        pass
    return final_dict
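The unmapped-content check inside albertson_main() combines two scorers: fuzz.partial_token_set_ratio decides whether a pdfplumber line is already covered by the table-derived content, and a plain fuzz.ratio guards against token-set matches between strings that are otherwise quite different. A sketch of that dual-threshold filter in isolation, assuming the same 90/70 cut-offs and a non-empty content list:

from fuzzywuzzy import fuzz, process

def find_unmapped(plumber_lines, mapped_content, set_cutoff=90, ratio_cutoff=70):
    """Return the lines that the table extraction likely missed."""
    unmapped = []
    for line in plumber_lines:
        _, set_score = process.extract(line.lower(), mapped_content,
                                       scorer=fuzz.partial_token_set_ratio)[0]
        _, plain_score = process.extract(line.lower(), mapped_content,
                                         scorer=fuzz.ratio)[0]
        # Unmapped if no content row covers the line's tokens, or the
        # token-set match is strong while the plain ratio disagrees.
        if set_score < set_cutoff or (set_score > set_cutoff and plain_score < ratio_cutoff):
            unmapped.append(line)
    return unmapped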
Ejemplo n.º 52
0
 def search_columns(self, colname, limit=5):
     from fuzzywuzzy import process
     extracted = process.extract(colname,
                                 self.all_columns.keys(),
                                 limit=limit)
     return [x[0] for x in extracted]
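A short usage sketch for the helper above, assuming a hypothetical object whose all_columns attribute is a dict keyed by column names:

from fuzzywuzzy import process

class ColumnIndex:
    def __init__(self, all_columns):
        self.all_columns = all_columns

    def search_columns(self, colname, limit=5):
        extracted = process.extract(colname, self.all_columns.keys(), limit=limit)
        return [x[0] for x in extracted]

idx = ColumnIndex({'customer_id': 'int', 'customer_name': 'str', 'order_date': 'date'})
print(idx.search_columns('custmer id', limit=2))  # likely the two customer_* columns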
Ejemplo n.º 53
0
correct_with_rating = []

#initial CSVs are read in here; the madris dataframe is the one most likely to be altered
df_madris = pd.read_csv("madris.csv")
df_harvard = pd.read_csv("my_harvard.csv")

df_madris['DEGREE DESCR'] = 'X' + df_madris['DEGREE DESCR'].astype(str)
df_madris['DEGREE DESCR'] = df_madris['DEGREE DESCR'].str.upper()

df_temp_madris = df_madris['DEGREE DESCR'].values
df_temp_harvard = df_harvard['Degree'].values

#using extract rather than extractOne keeps the first and second best fits to the tested data
for row in df_temp_madris:
    x = process.extract(row, df_temp_harvard, limit=2)
    correct_with_rating.append({
        "Correct_Degree_1(my.harvard)": x[0],
        "Tested_Degree(madris)": row.strip(" "),
        "Correct_Degree_2(my.harvard)": x[1],
        "Rating_best_fit": x[0][1]
    })

df_processed_data = pd.DataFrame(correct_with_rating)

df_processed_data = df_processed_data.sort_values(by="Rating_best_fit",
                                                  ascending=False)

# columnsTitles=["Correct_Degree(my.harvard)","Tested_Degree(madris)","Rating"]
# df_processed_data=df_processed_data.reindex(columns=columnsTitles)
print(df_processed_data)
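As the comment above notes, extract (rather than extractOne) is used so that both the first and second best candidates are kept, which is what feeds the two Correct_Degree columns and the Rating_best_fit score. A small illustration of the returned shape, with made-up degree strings:

from fuzzywuzzy import process

choices = ['XMASTER OF SCIENCE', 'XMASTER OF ARTS', 'XDOCTOR OF PHILOSOPHY']
hits = process.extract('Master of Sci', choices, limit=2)
# hits is a list of (choice, score) tuples, best first, e.g.
# [('XMASTER OF SCIENCE', <top score>), ('XMASTER OF ARTS', <lower score>)]
best, runner_up = hits
print(best, runner_up, best[1])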
Ejemplo n.º 54
0
layers45 = []
for key in pdk45_csv.keys():
    if key != "name":
        layers45.append(key.split('LAYER')[0])

layers15 = []
for key in pdk15_csv.keys():
    if key != "name":
        layers15.append(key.split('LAYER')[0])

final_pairs = []
list1 = layers45
list2 = layers15
while True:
    pairs = []
    for key in list1:
        result = process.extract(key, list2, limit=2, scorer=scorer)
        match = result[0][0]
        score = result[0][1]
        pairs.append((key, match, score))
        # print(key, ':', match, score)

    max_score = 50
    perfect_match = False
    best_pair = []
    for pair in pairs:
        key, match, score = pair
        if score == 100:
            perfect_match = True
            list1.remove(key)
            list2.remove(match)
            print("matching", key, match, score)
Ejemplo n.º 55
0
def company_name(content_and_scores):
    content = content_and_scores[0]
    sentiment = content_and_scores[1]
    wordtoke = nltk.word_tokenize(content)
    wordtag = nltk.pos_tag(wordtoke)
    nouns = findtags("NN", wordtag)
    main_noun = nouns['NNP'][0][0]
    #print(main_noun)

    poscomp = []
    pronouns = nouns['NNP']
    try:
        pronouns_s = nouns['NNPS']
        for pronoun in pronouns:
            #print(pronoun[0])
            poscomp.append(pronoun[0])
        for p in pronouns_s:
            poscomp.append(p[0])
            #print(pronouns)
    except:
        for pronoun in pronouns:
            #print(pronoun[0])
            poscomp.append(pronoun[0])
    #print(poscomp)

    path1 = "/Users/Master Soe/webscrap/Stock Companies/A-Z_companies.csv"
    with open(path1, newline='') as f:
        reader = csv.reader(f)
        data = [row for row in reader]
    path2 = "/Users/Master Soe/webscrap/Stock Companies/companynames.txt"
    k = open(path2, newline='')
    text = k.read()
    k.close()
    company = poscomp
    badwordlist = [
        "Corporation", "Co.", "Incorporated", "Inc.", "Company",
        "Communications", "Fund", "Trust", "Investment", "Associates",
        "NYSE", "NASDAQ", "Stock", "Securities", "Bloomberg"
    ]
    #try:
    #for comp in poscomp:
    #if re.search(comp, text):
    #company.append(comp)
    #print(company)
    #raise StopIteration
    #else:
    #pass
    #print("No company found and Sam is f****t")
    #except StopIteration:
    #pass

    #print(company)
    scores1 = []
    companylist = []
    TICKERlist = []

    # Gets data from data lol
    for companydetails in data:
        #print(companydetails[1])
        companylist.append(companydetails[1])
        TICKERlist.append(companydetails[0])

    # Checks if there are any common words in the list; iterate over a copy
    # so that removals do not skip elements
    for badword in badwordlist:
        for fx in list(company):
            if re.search(fx, badword):
                company.remove(fx)
    # Removes \\ words (again over a copy, so a single pass is enough)
    for x in list(company):
        if re.search('\\\\', x):
            company.remove(x)
    print("SENTIMENT SCORE", sentiment)
    print("List of possible companies")
    print(company)
    # Makes permutations for the word
    #for l in data:
    #    companylist.append(l[1])
    #_gen = (itertools.permutations(company, i + 1) for i in range(len(company)))
    #all_permutations_gen = itertools.chain(*_gen)
    #results = [x for x in all_permutations_gen]
    #k = []
    # Arranges the combinations into a readable array
    #for x in results:
    #    j = ""
    #    for y in x:
    #        j = j + y + " "
    #    k.append(j)
    #scores2 = [""]
    #print(k)

    # Checks the match score of word to a company
    for x in company:
        #print(x)
        if re.search('[a-z]+', x) is None:
            possible_company_score = process.extract(x, TICKERlist, limit=3)
        else:
            possible_company_score = process.extract(x, companylist, limit=3)
            #print(possible_company_score)
            #print(companylistx)
        # avoid shadowing the outer loop variable x
        scores1.extend(possible_company_score)

    #print(scores1)
    c = Counter(scores1)
    guess_company = c.most_common()
    #print(guess_company)
    i = []
    for g in guess_company:
        i.append(g[0])
    print(" ")

    def custom_sort(t):
        return t[1]

    i.sort(key=custom_sort, reverse=True)
    #print(i)
    print("The stock company(s) is", i[0:5])
    #except:
    #pass

    return guess_company
Ejemplo n.º 56
0
 def fuzzy_wuzzy(m):
     movie_title = process.extract(m, movie_titles)[0][0]
     return movie_title
Ejemplo n.º 57
0
def didyoumean(input_command):
    return process.extract(input_command,
                           command_lib.keys(),
                           scorer=fuzz.partial_ratio,
                           limit=1)[0][0]
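A usage sketch for the helper above, assuming a hypothetical command_lib dict keyed by command names:

from fuzzywuzzy import fuzz, process

command_lib = {'install': None, 'uninstall': None, 'update': None, 'search': None}

def didyoumean(input_command):
    return process.extract(input_command,
                           command_lib.keys(),
                           scorer=fuzz.partial_ratio,
                           limit=1)[0][0]

print(didyoumean('instal'))  # most likely suggests 'install'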
Ejemplo n.º 58
0
cou2 = country.sort_values('country_txt')
# Remove the space separators from the country names in the original data
cou2['country_txt'] = cou2['country_txt'].str.replace(' ', '')
cou = pd.merge(cou1,
               cou2,
               left_on=['国家'],
               right_on=['country_txt'],
               how='outer')
# Fuzzy-match the rows that could not be joined exactly
a = cou[cou.isnull().any(axis=1)]
a1 = a['国家'].dropna().values.tolist()
a2 = a['country_txt'].dropna().values.tolist()

dic = dict()
for t in a2:
    x = process.extract(t, a1, limit=2)
    if (x[0][1] >= 68 or x[0][1] == 50) and (x[0][1] != 72):
        dic[t] = x[0][0]
t = [a for a, b in enumerate(cou['country_txt']) if b in dic]
cou.iloc[t, 0] = list(dic.values())
cou = cou.iloc[:, :2].dropna()
cou['country'] = cou['country'].astype(int)
# Merge the country codes into the eco economic dataset
eco1 = pd.merge(eco, cou, on=['国家'], how='right')
eco1.rename(columns={'年份': 'iyear'}, inplace=True)
eco1 = eco1.drop('国家', axis=1)

# Merge the economic data into the terrorist-attack table by country code and year
eco1 = eco1.sort_values(['iyear', 'country'])
data4 = pd.merge(data3, eco1, on=['country', 'iyear'], how='left')
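The fuzzy step above patches up an outer merge: country names that failed the exact join are re-matched by score, and only matches that clear the hand-tuned thresholds are written back into cou. A stripped-down sketch of the same reconcile-by-threshold idea on two small name lists (a simple 68 cut-off, simplified from the hand-tuned condition above):

from fuzzywuzzy import process

unmatched_left = ['United States', 'South Korea', 'Russia']
unmatched_right = ['UnitedStates', 'KoreaSouth', 'RussianFederation']

mapping = {}
for name in unmatched_right:
    best, score = process.extract(name, unmatched_left, limit=2)[0]
    if score >= 68:  # keep only reasonably confident matches
        mapping[name] = best
print(mapping)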
Ejemplo n.º 59
0
def search():

    if request.method == "OPTIONS":  # CORS preflight
        return _build_cors_prelight_response()
    elif request.method == 'GET':
        totalpages = 0
        content = request.get_json()
        print(content)
        Filename = "./contractCollection.csv"
        df = pd.read_csv(Filename, error_bad_lines=False)
        df = df.fillna("False")
        result = []
        clause_category = ""
        tag = ""
        cont_type = ""
        text = ""
        pages = 0
        try:
            print("try")
            cont_type = content["cont_type"]
            clause_category = content["clause_category"]
            tag = content["tag"]

            text = content["text"]
            pages = content["page"]
        except:
            return _corsify_actual_response(jsonify("parameters error"))
        if cont_type != "" and clause_category != "":
            contain_values = df[df['name'].str.contains(cont_type)]
            result = contain_values[contain_values['ClausesCategories'].str.contains(
                clause_category)]
            if (len(result)) == 0:
                result = [1, 2]
        elif cont_type != "":
            result = df[df['name'].str.contains(cont_type)]
            if (len(result)) == 0:
                result = [1, 2]
        elif clause_category != "":
            result = df[df['ClausesCategories'].str.contains(clause_category)]
            if (len(result)) == 0:
                result = [1, 2]
        else:
            result = [1, 2]
        #print ("result",len(result))

        if len(result) > 0:
            Filename = "./ClausesCategoriesCollection.csv"
            df2 = pd.read_csv(Filename, error_bad_lines=False)
            df2 = df2.fillna("False")
            claid = df2["_id"].tolist()
            claname = df2["name"].tolist()
            if clause_category != "":  #########if category is empty
                contain_values = df2[df2['name'].str.contains(
                    clause_category)]  ##if caluse category exist
                if len(contain_values) > 0:
                    ids = int(contain_values["_id"])
                    Filename = "./ClauseCollection.csv"
                    df3 = pd.read_csv(Filename, error_bad_lines=False)
                    df3 = df3.fillna("False")
                    #df3['clauseID']=pd.to_numeric(df3['clauseID'])
                    rows = df3.loc[(df3['tags'] == tag)
                                   & (df3['clauseID'] == ids)]
                    data = []
                    totalpages = math.ceil(len(rows) / 10)
                    print("pages", len(rows), totalpages)
                    rows = rows.iloc[pages:]
                    for index, row in rows.iterrows():
                        if (len(data) < 10):
                            print("row", row["_id"])
                            #data.append({"name":row["name"],"description":row["description"]})
                            data.append({
                                "description":
                                row["description"],
                                "clause_type":
                                row["name"],
                                "category":
                                claname[claid.index(row["clauseID"])],
                                "tag":
                                row["tags"]
                            })
                    #print("data",data)
                    if len(data) == 0:
                        rows = df3.loc[(df3['clauseID'] == ids)]
                        data = []
                        totalpages = math.ceil(len(rows) / 10)
                        print("pages", len(rows), totalpages)
                        rows = rows.iloc[pages:]
                        for index, row in rows.iterrows():
                            if (len(data) < 10):
                                print("row", row["_id"])
                                print("calid", claid.index(row["clauseID"]))
                                print("claname",
                                      claname[claid.index(row["clauseID"])])
                                #data.append({"name":row["name"],"description":row["description"]})
                                data.append({
                                    "description":
                                    row["description"],
                                    "clause_type":
                                    row["name"],
                                    "category":
                                    claname[claid.index(row["clauseID"])],
                                    "tag":
                                    row["tags"]
                                })
                        array2 = []
                        array2.append({"pages": totalpages})
                        array2.append({"data": data})

                        return _corsify_actual_response(jsonify(array2))
                    array2 = []
                    array2.append({"pages": totalpages})
                    array2.append({"data": data})
                    return _corsify_actual_response(jsonify(array2))
                    ####return data
                    #print("rows",rows,df3.dtypes)
                elif tag != "":  ####################if category does not exist in records but tag exist
                    #ids=int(contain_values["_id"])
                    Filename = "./ClauseCollection.csv"
                    df3 = pd.read_csv(Filename, error_bad_lines=False)
                    df3 = df3.fillna("False")
                    #df3['clauseID']=pd.to_numeric(df3['clauseID'])
                    rows = df3.loc[(df3['tags'] == tag)]
                    data = []
                    totalpages = math.ceil(len(rows) / 10)
                    print("pages", len(rows), totalpages)
                    rows = rows.iloc[pages:]
                    for index, row in rows.iterrows():
                        if (len(data) < 10):
                            print("calid", claid.index(row["clauseID"]))
                            print("claname",
                                  claname[claid.index(row["clauseID"])])
                            #print("row",row["_id"])
                            #data.append({"name":row["name"],"description":row["description"]})
                            data.append({
                                "description":
                                row["description"],
                                "clause_type":
                                row["name"],
                                "category":
                                claname[claid.index(row["clauseID"])],
                                "tag":
                                row["tags"]
                            })
                        else:
                            break
                    #print("data tag exist",data)
                    array2 = []
                    array2.append({"pages": totalpages})
                    array2.append({"data": data})
                    return _corsify_actual_response(jsonify(array2))
                    ####return data
                elif text != "":  #########tag does not exist but text exist
                    Filename = "./ClauseCollection.csv"
                    df3 = pd.read_csv(Filename, error_bad_lines=False)
                    df3 = df3.fillna("False")
                    entities = df3['name'].tolist()
                    descriptions = df3['description'].tolist()
                    tagss = df3['tags'].tolist()
                    claidss = df3['clauseID'].tolist()
                    results = process.extract(text,
                                              descriptions,
                                              scorer=fuzz.token_sort_ratio)
                    #print(results)
                    data = []
                    print(results[0][0], results[0][1])
                    import random
                    n = random.randint(1, 10)
                    totalpages = 1
                    print("pages", totalpages)
                    for x in results:
                        if (len(data) < n):
                            # x is a (matched_description, score) tuple;
                            # recover the row index from the matched text
                            idx = descriptions.index(x[0])
                            data.append({
                                "description": x[0],
                                "clause_type": entities[idx],
                                "category": claname[claid.index(claidss[idx])],
                                "tag": tagss[idx]
                            })
                        else:
                            break
                    #print("data",data)
                    array2 = []
                    array2.append({"pages": totalpages})
                    array2.append({"data": data})
                    return _corsify_actual_response(jsonify(array2))
                else:  ############if text not exist
                    #print("g aya no")
                    return _corsify_actual_response(jsonify({}))

            elif tag != "":  ############if clause category does not exist but tag exist
                Filename = "./ClauseCollection.csv"
                df3 = pd.read_csv(Filename, error_bad_lines=False)
                df3 = df3.fillna("False")
                #df3['clauseID']=pd.to_numeric(df3['clauseID'])
                rows = df3.loc[(df3['tags'] == tag)]
                data = []
                totalpages = math.ceil(len(rows) / 10)
                print("pages", len(rows), totalpages)
                rows = rows.iloc[pages:]
                for index, row in rows.iterrows():
                    if (len(data) < 10):
                        print("row", row["_id"])
                        print("calid", claid.index(row["clauseID"]))
                        print("claname", claname[claid.index(row["clauseID"])])
                        #data.append({"name":row["name"],"description":row["description"]})
                        data.append({
                            "description":
                            row["description"],
                            "clause_type":
                            row["name"],
                            "category":
                            claname[claid.index(row["clauseID"])],
                            "tag":
                            row["tags"]
                        })
                #print("data if tag exist only",len(data))
                array2 = []
                array2.append({"pages": totalpages})
                array2.append({"data": data})
                return _corsify_actual_response(jsonify(array2))
            elif text != "":  #########tag does not exist but text exist
                Filename = "./ClauseCollection.csv"
                df3 = pd.read_csv(Filename, error_bad_lines=False)
                df3 = df3.fillna("False")
                entities = df3['name'].tolist()
                descriptions = df3['description'].tolist()
                tagss = df3['tags'].tolist()
                claidss = df3['clauseID'].tolist()
                results = process.extract(text,
                                          descriptions,
                                          scorer=fuzz.token_sort_ratio)
                #print(results)
                data = []
                import random
                n = random.randint(1, 10)
                totalpages = 1
                print("pages", totalpages)
                #print(results[0][0],results[0][1])
                for x in results:
                    if (len(data) < 10):
                        # x is a (matched_description, score) tuple;
                        # recover the row index from the matched text
                        idx = descriptions.index(x[0])
                        data.append({
                            "description": x[0],
                            "clause_type": entities[idx],
                            "category": claname[claid.index(claidss[idx])],
                            "tag": tagss[idx]
                        })
                    else:
                        break
                #print("data",data)
                array2 = []
                array2.append({"pages": totalpages})
                array2.append({"data": data})
                return _corsify_actual_response(jsonify(array2))
            else:  ############if text not exist
                return _corsify_actual_response(jsonify({}))
        #print(clause_category)
        #data = request.values
        #print("coming",request.form["id"])

        #id=str(request.form["id"])
        #print(id,type(id))
        #response=model.recommendation(id)

    else:
        return _corsify_actual_response(jsonify("error"))
Ejemplo n.º 60
0
 def _get_matches(self, name, match_limit):
     return process.extract(name,
                            self.friends,
                            scorer=fuzz.UWRatio,
                            limit=match_limit)
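fuzz.UWRatio is the Unicode-aware variant of WRatio, which is the point of using it here when friend names may contain non-ASCII characters. A small sketch of how the method above might behave, with a made-up friends list:

from fuzzywuzzy import fuzz, process

friends = ['José García', 'Jörg Müller', 'Joe Garrick']
print(process.extract('Jose Garcia', friends, scorer=fuzz.UWRatio, limit=2))
# e.g. [('José García', <high score>), ('Joe Garrick', <lower score>)]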