Ejemplo n.º 1
0
 def get_byline_image(self, force_new=False):
     """Return the byline photo, looking one up on disk when necessary.

     Unless ``force_new`` is true, an already assigned ``self.byline_photo``
     is returned unchanged. Otherwise every ``*.jpg`` file in
     ``settings.BYLINE_PHOTO_DIR`` is fuzzy-compared with the slugified
     ``self.name`` (in both "first last" and "last first" order). The best
     file scoring above 90 is stored as a new ``ProfileImage``, assigned to
     ``self.byline_photo``, saved and returned.

     Returns ``None`` when no file name scores above the threshold.
     """
     if not force_new and self.byline_photo:
         return self.byline_photo
     slugify = Slugify()
     imagefiles = glob.glob(os.path.join(settings.BYLINE_PHOTO_DIR, "*.jpg"))
     name = self.name
     # Move the last word to the front so surname-first file names match too.
     name_last_first = re.sub(r"^(.*) (\S+)$", r"\2 \1", name)
     name_slug_title = slugify(name) + ".jpg"
     name_slug = name_slug_title.lower()
     name_slug_reverse = slugify(name_last_first).lower() + ".jpg"
     bestratio = 90
     bestmatch = None
     for path in imagefiles:
         filename = os.path.basename(path).lower()
         ratio = max(fuzz.ratio(filename, name_slug), fuzz.ratio(filename, name_slug_reverse))
         if ratio > bestratio:
             bestmatch = path
             bestratio = ratio
             if ratio == 100:
                 # A perfect score cannot be beaten; stop scanning.
                 break
     if not bestmatch:
         return None
     # Report the winning score. The loop variable ``ratio`` holds the score
     # of the last file examined, which is usually not the best match.
     msg = "found match: name:{}, img:{}, ratio:{} ".format(name_slug, bestmatch, bestratio)
     logger.debug(msg)
     with open(bestmatch, "rb") as source:
         img = ProfileImage()
         img.source_file.save(name_slug_title, File(source))
     self.byline_photo = img
     self.save()
     return img
def findCoord(fileName, sttlReg, fWriter):
  """
  Look up a settlement in the cornu JSON file and write every fuzzy match as a CSV row.

  fileName: path of a UTF-8 JSON file whose top-level "data" entry is a list of place records.
  sttlReg:  dash-separated string; part 0 is the settlement name, parts 1 and 2 are copied
            into the output row unchanged.
  fWriter:  object with a writerow(list) method (e.g. a csv writer).

  For each record the settlement name is compared (after normalizeArabic) with "arTitle";
  if that scores below 90, the comma-separated alternatives in "arTitleOther" are tried and
  only the first alternative scoring at least 90 is written. Nothing is written when
  sttlReg is empty.
  """
  regParts = sttlReg.split('-')
  sttlName = regParts[0]
  with open(fileName, "r", encoding="utf8") as jsonFile:
    allData = json.load(jsonFile)
  if not sttlReg:
    # The original guarded every comparison with "sttlReg and ..."; an empty value never matches.
    return

  # Normalise the searched name once instead of once per comparison.
  normSttl = normalizeArabic(sttlName)

  def writeMatch(record, altNames, score):
    # regParts[1] and regParts[2] are indexed only when a match is actually written.
    fWriter.writerow([sttlName, record["arTitle"], altNames, record["lat"], record["lon"],
                      record["region"], regParts[1], regParts[2], record["eiSearch"],
                      record["translitTitle"], score])

  for d in allData["data"]:
    sName = d["arTitleOther"].split(",")
    # check if it finds similar words with arTitle, using fuzzywuzzy library
    score = fuzz.ratio(normSttl, normalizeArabic(d["arTitle"]))
    if score >= 90:
      writeMatch(d, "/".join(sName), score)
      continue
    # otherwise check the alternative titles, using fuzzywuzzy library
    for n in sName:
      n = n.strip()
      score = fuzz.ratio(normSttl, normalizeArabic(n))
      if score >= 90:
        writeMatch(d, n, score)
        break
Ejemplo n.º 3
0
 def get_byline_image(self, force_new=False):
     """Return the byline photo, looking one up on disk when necessary.

     Unless ``force_new`` is true, an already assigned ``self.byline_photo``
     is returned unchanged. Otherwise every ``*.jpg`` file in
     ``BYLINE_PHOTO_FOLDER`` is fuzzy-compared with the slugified, lowercased
     ``self.name`` (in both "first last" and "last first" order). For the best
     file scoring above 90 an ``ImageFile`` is fetched or created, autocropped,
     assigned to ``self.byline_photo``, saved and returned.

     Returns ``None`` when no file name scores above the threshold.
     """
     if not force_new and self.byline_photo:
         return self.byline_photo
     slugify = Slugify(to_lower=True)
     imagefiles = glob.glob(BYLINE_PHOTO_FOLDER + '/*.jpg')
     name = self.name.lower()
     # Move the last word to the front so surname-first file names match too.
     name_last_first = re.sub(r'^(.*) (\S+)$', r'\2 \1', name)
     name_slug = slugify(name) + '.jpg'
     name_slug_reverse = slugify(name_last_first) + '.jpg'
     bestratio = 90
     bestmatch = None
     for path in imagefiles:
         filename = os.path.basename(path).lower()
         ratio = max(
             fuzz.ratio(filename, name_slug),
             fuzz.ratio(filename, name_slug_reverse)
         )
         if ratio > bestratio:
             bestmatch = path
             bestratio = ratio
             if ratio == 100:
                 # A perfect score cannot be beaten; stop scanning.
                 break
     if not bestmatch:
         return None
     # Report the winning score. The loop variable ``ratio`` holds the score
     # of the last file examined, which is usually not the best match.
     msg = 'found match: name:{}, img:{}, ratio:{} '.format(
         name_slug, bestmatch, bestratio)
     logger.debug(msg)
     img, _ = ImageFile.objects.get_or_create(source_file=bestmatch)
     img.autocrop()
     self.byline_photo = img
     self.save()
     return img
Ejemplo n.º 4
0
 def has_similar(self, url):
     '''
     '''
     if url == "":
         return (False,[], [])
     if self.has_node( url):
         return (True,[], self.search_url_index(url))
     else:
         max_rate = 0
         url_split_list, host_po = up.url_split( url)
         max_list =[]
         max_url = ""
         rate = 0
         for my_url in self.treeContent:
             if fuzz.ratio( os.path.splitext( my_url)[1], os.path.splitext( url)[1])<SIMILAR_THRESHOLD:
             # if not has the same expend name
                 continue
             if (fuzz.ratio(url, my_url)/100.0)< self.SIMILAR_THRESHOLD:
                 continue
             my_url_list, my_host_po = up.url_split(my_url)
             rate, dismatch_list = up.url_list_compare(url_split_list, my_url_list)
             rate = max( fuzz.ratio(url, my_url)/100.0, rate)
             if max_rate < rate:
                 max_rate = rate
                 max_list = dismatch_list
                 max_url = my_url
         if max_rate > self.SIMILAR_THRESHOLD and max_url != "":
             print "SIMILAR URL(Tree.has_similar):\n",max_url,"\n",url
             return (True, dismatch_list, self.search_url_index(max_url))
         return (False, max_list, [])
Ejemplo n.º 5
0
def parseTweet(tweet):
	"""Tag a tweet with the locations, transport modes and services it mentions.

	The tweet dict is modified in place: ``created_at`` is cut at the first
	``+`` and ``timestampint`` is set from ``timestamp_ms``. The lowercased,
	space-separated words of ``text`` are then fuzzy-matched against
	``LOCATIONS``, ``TRANSPORT_MAP`` and ``SERVICE_MAP``, using the per-term
	minimum scores in ``scoreMap``.

	Returns a list with one tweet dict per hit. The first hit reuses the
	incoming dict; every later hit is tagged on ``cloneTweet(theTweet)``.
	The list is empty when nothing matches.
	"""
	tweet['created_at'] = tweet['created_at'].split('+')[0]
	tweet['timestampint'] = int(tweet['timestamp_ms'])
	isFirst = True
	# A real list, so the words can be scanned once per location/term.
	words = [word.lower() for word in tweet["text"].split(' ')]
	newTweets = []
	theTweet = tweet
	for location in LOCATIONS:
		# Start each location pass without a previous word; a word left over
		# from the previous pass must not be glued to this pass's first word.
		prevWord = None
		for word in words:
			score = fuzz.ratio(word, location)
			if score > scoreMap[location]: 
				if not isFirst:
					tweet = cloneTweet(theTweet)
				tweet["location"] = location
				isFirst = False
				newTweets.append(tweet)
				break
			# "<name> nagar" may be written as two words: retry with the
			# previous word glued on, but only when there is a previous word.
			score = fuzz.ratio(word, 'nagar')
			if score > 90 and prevWord is not None:
				score = fuzz.ratio(prevWord+word, location)
				if score > scoreMap[location]: 
					if not isFirst:
						tweet = cloneTweet(theTweet)
					tweet["location"] = location
					isFirst = False
					newTweets.append(tweet)
					break
			prevWord = word
	for key, values in TRANSPORT_MAP.iteritems():
		# At most one hit per transport key: stop at the first matching term.
		done = False
		for value in values:
			if done:
				break
			for word in words:
				score = fuzz.ratio(word, value)
				if score > scoreMap[value]:
					if not isFirst:
						tweet = cloneTweet(theTweet)
					tweet["transport"] = key
					isFirst = False
					newTweets.append(tweet)
					done = True
					break
	for key, values in SERVICE_MAP.iteritems():
		# At most one hit per service key: stop at the first matching term.
		done = False
		for value in values:
			if done:
				break
			for word in words:
				score = fuzz.ratio(word, value)
				if score > scoreMap[value]:
					if not isFirst:
						tweet = cloneTweet(theTweet)
					tweet["service"] = key
					isFirst = False
					newTweets.append(tweet)
					done = True
					break
	return newTweets
Ejemplo n.º 6
0
    def _employees(self, company_name="", keyword=""):
        ''' Find LinkedIn profiles of a company's employees through a Google search.

        Builds a ``site:linkedin.com`` query for ``"at <company_name>"`` plus the
        optional ``keyword``, scores every hit against the company name (and the
        keyword, when given), keeps the rows above the score thresholds, persists
        them through ``CompanyExtraInfoCrawl()._persist`` and returns them.
        '''
        # TODO - add linkedin directory search
        ''' Linkedin Scrape'''
        # Exclude directory, search, update, job and company pages from the hits.
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
        args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        # NOTE(review): the meaning of the second argument (10) is defined by
        # Google.search, which is not visible here -- confirm.
        results = Google().search(qry, 10)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        # NOTE(review): the "(?i)" prefix is part of the string handed to the fuzzy
        # scorers below; it is not applied as a regex flag anywhere in this method.
        _name = '(?i){0}'.format(company_name)
        # Multi-word company names are scored with partial_ratio, single words with ratio.
        if " " in company_name:
            results['company_score'] = [fuzz.partial_ratio(_name, company) 
                                        for company in results.company]
        else:
            results['company_score'] = [fuzz.ratio(_name, company) 
                                        for company in results.company]
        # When a keyword is given, keep only rows whose title scores above 75 against it.
        if keyword != "":
            results['score'] = [fuzz.ratio(keyword, title) 
                                for title in results.title]
            results = results[results.score > 75]

        # Keep only rows whose company scores above 64.
        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        data = {'data': results.to_dict('r'), 'company_name':company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")

        # If the current rq job belongs to a named queue that has completed,
        # enqueue the Jigsaw CSV upload for the job's company.
        job = rq.get_current_job()
        print job.meta.keys()
        if "queue_name" in job.meta.keys():
          if RQueue()._has_completed(job.meta["queue_name"]):
            q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return results
Ejemplo n.º 7
0
    def Handle(self, query):
        """Search voters whose last and first names fuzzily match ``query``.

        Voters with a null first or last name are excluded. When
        ``self.Validate(query)`` returns ``None`` the voters are narrowed with
        ``self.FilterByInitial``, then kept only if both their last-name and
        first-name scores exceed ``SearchVotersQueryHandler.RatioThreshold``.
        Matches are sorted by ``ratioAvg()`` in descending order.

        Returns a dict with ``voters`` (at most ``MaxResults`` matches, empty
        when validation failed), ``searchTerm`` (the query) and ``validation``
        (the validation messages, or ``None``).
        """
        voters = Voter.objects.exclude(last_name=None).exclude(first_name=None)
        ## if searchTerm.date_of_birth != None and searchTerm.date_of_birth != "":
        ##     voters = voters.filter(date_of_birth=searchTerm.date_of_birth)

        ## Validate and initialize response results
        validationMessages = self.Validate(query)
        firstNameMatches = []
        lastNameMatches = []

        if validationMessages is None:
            threshold = SearchVotersQueryHandler.RatioThreshold

            ## Pull voters with the same initials
            voters = self.FilterByInitial(query, voters)

            ## Run search algorithm on last names first, then on first names.
            ## Each score is stored under the attribute that names it; the
            ## previous code stored the last-name score as first_name_ratio
            ## and the first-name score as last_name_ratio.
            for voter in voters:
                voter.last_name_ratio = fuzz.ratio(query.last_name, voter.last_name)
                if voter.last_name_ratio > threshold:
                    lastNameMatches.append(voter)
            for voter in lastNameMatches:
                voter.first_name_ratio = fuzz.ratio(query.first_name, voter.first_name)
                if voter.first_name_ratio > threshold:
                    firstNameMatches.append(voter)

            # sorting
            firstNameMatches.sort(key=lambda x: x.ratioAvg(), reverse=True)
        return { 'voters': firstNameMatches[:SearchVotersQueryHandler.MaxResults], 'searchTerm': query, 'validation': validationMessages }
def fuzzy_match(college_niche_row, admithub_name, highest_match):
	"""Score one niche college name against an AdmitHub name.

	The niche name is element 1 of ``college_niche_row``. Both names are
	lowercased and have dashes replaced by spaces before being compared.

	Returns ``(best_score, improved)``: the new score and ``True`` when it is
	strictly greater than ``highest_match``, otherwise ``highest_match`` and
	``False``.
	"""
	niche_label = college_niche_row[1].lower().replace("-", " ")
	admithub_label = admithub_name.lower().replace("-", " ")
	score = fuzz.ratio(niche_label, admithub_label)
	if score > highest_match:
		return score, True
	return highest_match, False
Ejemplo n.º 9
0
    def _get_tvmaze_data(self, show, title):
        """Return the TVmaze episode record whose name best matches ``title``.

        The show id is resolved once and cached in ``self.tvmaze_show_id``: the
        TVmaze show search is queried for ``show`` and the result whose name has
        the highest ``fuzz.ratio`` against it is chosen. The episode list
        (including specials) is cached in ``self.tvmaze_episode_data``.

        Raises:
            ValueError: if no show, or no episode, scores above zero.
        """
        if not self.tvmaze_show_id:
            # Only hit the search endpoint when the show id is not cached yet.
            show_r = requests.get('http://api.tvmaze.com/search/shows', params={'q': show})
            show_search_data = json.loads(show_r.text)
            best_show_ratio = 0
            best_show_id = None
            best_show_name = None
            # Scan every result and keep the best one; stopping at the first
            # non-zero score would ignore closer names further down the list.
            for result in show_search_data:
                candidate = result['show']
                show_ratio = fuzz.ratio(show, candidate['name'])
                if show_ratio > best_show_ratio:
                    best_show_ratio = show_ratio
                    best_show_id = candidate['id']
                    best_show_name = candidate['name']
            if best_show_id is None:
                raise ValueError('Could not match TV show data')
            self.tvmaze_show_id = best_show_id
            print('I believe the show name is {}'.format(best_show_name))

        if not self.tvmaze_episode_data:
            episode_r = requests.get('http://api.tvmaze.com/shows/{}/episodes?specials=1'.format(self.tvmaze_show_id))
            self.tvmaze_episode_data = json.loads(episode_r.text)
        best_ratio = 0
        best_obj = None
        for episode in self.tvmaze_episode_data:
            episode_ratio = fuzz.ratio(title, episode['name'])
            if episode_ratio > best_ratio:
                best_ratio = episode_ratio
                best_obj = episode
        if best_obj is None:
            # Fail with a clear error rather than a TypeError when
            # subscripting None in the message below.
            raise ValueError('Could not match TV episode data')
        print('I believe the episode name is {} (s{} e{})'.format(best_obj['name'],
                                                                  best_obj['season'],
                                                                  best_obj['number']))
        return best_obj
Ejemplo n.º 10
0
def fuzz_roadnames(roadn):
    #create empty lists for testing
    fuzztest = []
    fuzztestNoSP = []

    #loop through road names and append the fuzzy result to the fuzztest and NoSpace lists
    for cn in roadn:
            #testname lowercase
            fuzztest.append(fuzz.ratio(cn,testname))
            #testname without spaces
            fuzztestNoSP.append(fuzz.ratio(cn,testnameNoSP))

    # force ratio list results into a numpy array
    # find max ratio and indicies where max ratio exists
    fuzztest = np.array(fuzztest)
    maxrat = np.max(fuzztest)
    indies = np.where(fuzztest == maxrat)

    fuzztestNoSP = np.array(fuzztestNoSP)
    maxratNoSP = np.max(fuzztestNoSP)
    indiesNoSP = np.where(fuzztestNoSP == maxratNoSP)
    print '\n\n\n\n\nREGULAR TEST'
    for i in indies[0]:
            print roadn[i]
    print '\n\n\n\n\nNO SPACE TEST'
    
    for i in indiesNoSP[0]:
            print roadn[i]
Ejemplo n.º 11
0
def get_twitter_facebook_google_id(url_instance, url_link):
    """Extract Twitter, Facebook and Google Play identifiers from quoted links.

    ``str(url_instance)`` is scanned for double-quoted http(s)/ftp(s) URLs.
    Each URL is filed under the first of Twitter, Facebook or Google Play
    whose base address it contains. Twitter and Facebook identifiers are the
    text after the first ``.com/`` and are kept only when their fuzzy score
    against ``url_link`` exceeds 75. Google Play identifiers are the text
    after ``.com/store/apps/details?`` and are all kept.

    Returns a tuple ``(twitter_ids, facebook_ids, google_ids)`` of lists.
    """
    found = re.findall('"((http|ftp)s?://.*?)"', str(url_instance))
    if not found:
        return [], [], []

    twitter_links, facebook_links, google_links = [], [], []
    for groups in found:
        # findall yields (whole_url, scheme) tuples; only the URL matters
        href = groups[0]
        if 'twitter.com/' in href:
            twitter_links.append(href)
        elif 'facebook.com/' in href:
            facebook_links.append(href)
        elif 'play.google.com/store/apps/details?' in href:
            google_links.append(href)

    def _close_ids(hrefs):
        # identifier = everything after the first ".com/", kept if close to url_link
        tails = (href.split('.com/', 1)[1] for href in hrefs)
        return [tail for tail in tails if fuzz.ratio(tail, url_link) > 75]

    twitter_return = _close_ids(twitter_links)
    facebook_return = _close_ids(facebook_links)
    google_return = [href.split('.com/store/apps/details?', 1)[1]
                     for href in google_links]
    return twitter_return, facebook_return, google_return
Ejemplo n.º 12
0
	def get(self, count):
		"""Search Echo Nest for a song and write the best matches as JSON.

		Reads the ``title`` and ``artist`` request arguments, cleans them with
		``clean`` and queries the Echo Nest song search with the next key from
		``echoKeys``. Each returned song gets a ``score`` (0-100), the mean of
		the title and artist ``fuzz.ratio`` values. Songs scoring at least 90
		are written, best first, limited to ``count`` entries.

		Any failure (network, JSON, bad ``count``) is logged and an empty JSON
		list is written instead; the response is always finished.

		NOTE(review): the body uses ``yield tornado.gen.Task``, so it relies on
		tornado coroutine decorators that are not visible in this snippet.
		"""
		import logging

		self.set_header("Content-Type", "application/json")
		title, artist = self.get_argument("title"), self.get_argument("artist")
		title, artist = clean(title), clean(artist)
		url = "http://developer.echonest.com/api/v4/song/search?api_key=%s&format=json&results=100&artist=%s&combined=%s"
		origin = self.request.remote_ip
		# Forward the caller's address to the upstream API.
		reqHeader = {'X-Forwarded-For': origin}
		echoTracks = []
		try:
			# next() works on Python 2 and 3 iterators alike.
			reqUrl = url % (next(echoKeys), quote_plus(artist), quote_plus(title + " " + artist))
			getReq = tornado.httpclient.HTTPRequest(reqUrl, headers=reqHeader)
			res = yield tornado.gen.Task(async_client.fetch, getReq)
			results = json.loads(res.body)['response']['songs']
			if len(results)>0:
				for r in results:
					score = fuzz.ratio(title, clean(r['title'])) + fuzz.ratio(artist, clean(r['artist_name']))
					r['score'] = (score/200.0)*100
				# Ascending sort then reverse, which keeps the original tie order.
				sorted_results = sorted(results, key=lambda r: r['score'])[::-1]
				echoTracks = [d for d in sorted_results if d['score']>=90][0:int(count)]
		except Exception:
			# Best effort: still answer with an empty list, but leave a trace
			# instead of discarding the error silently.
			logging.getLogger(__name__).exception("echonest song search failed")
		finally:
			self.write(json.dumps(echoTracks))
			self.finish()
Ejemplo n.º 13
0
def fuzzName(src_table,dst_table,catid):
	table_name = "result_"+catid+"_"+time.strftime("%Y_%m_%d_%H_%M_%S")
	table_query = "CREATE TABLE "+table_name+" (src_game_name mediumtext, dst_game_name mediumtext, src_buying_price float, dst_selling_price float, has_number int, match_ratio int)"
	my_cursor.execute(table_query)

	src_query = 'select game_name, selling_price, buying_price from '+src_table
	my_cursor.execute(src_query)
	src_rows = my_cursor.fetchall()

	dst_query = 'select game_name, selling_price, buying_price from '+dst_table
	my_cursor.execute(dst_query)
	dst_rows = my_cursor.fetchall()

	src_names = [src_row[0].strip() for src_row in src_rows]
	dst_names = [dst_row[0].strip() for dst_row in dst_rows]
	match = [src_name for src_name in src_names for dst_name in dst_names if src_name == dst_name]

	for m in match:
		fillResultTable(table_name, src_table, dst_table, m, m, '0', '100')
		if m in dst_names:
			dst_names.remove(m)
		if m in src_names:
			src_names.remove(m)	

	for src_name in src_names:
		if hasNumbers(src_name) == False:
			for dst_name in dst_names:
				if hasNumbers(dst_name) == False:
					ratio = fuzz.ratio(src_name,dst_name)
					if ratio >= 80:
						print src_name+"--"+dst_name
						fillResultTable(table_name, src_table, dst_table, src_name, dst_name, '0', ratio)
						if dst_name in dst_names:
							dst_names.remove(dst_name)
						if src_name in src_names:
							src_names.remove(src_name)
						break
		else:
			for dst_name in dst_names:
				ratio = fuzz.ratio(src_name,dst_name)
				if hasNumbers(src_name[-1]) and hasNumbers(dst_name[-1]):
					if ratio >= 80 and src_name[-1] == dst_name[-1]:
						print src_name+"--"+dst_name
						fillResultTable(table_name, src_table, dst_table, src_name, dst_name, '1', ratio)
						if dst_name in dst_names:
							dst_names.remove(dst_name)
						if src_name in src_names:
							src_names.remove(src_name)
						break
	db.commit()	
	query = 'alter table '+table_name+' add column price_difference float'
	my_cursor.execute(query)
	query = 'update '+table_name+' set `price_difference` = (`src_buying_price`-`dst_selling_price`)'
	my_cursor.execute(query)
	query = 'alter table '+table_name+' add column gain_percentage int'
	my_cursor.execute(query)
	query = 'update '+table_name+' set `gain_percentage` = (`price_difference`/`dst_selling_price`)*100'
	my_cursor.execute(query)
	db.commit()
Ejemplo n.º 14
0
def best_ch_fuzz(title):
    """Return the key of ``ch_titles`` most similar to ``title``.

    Keys are compared with ``fuzz.ratio``; on a tie the key seen first wins.
    An empty unicode string is returned when no key scores above zero.
    """
    winner, top_score = u'', 0
    for candidate in ch_titles.keys():
        score = fuzz.ratio(title, candidate)
        if score > top_score:
            winner, top_score = candidate, score
    return winner
Ejemplo n.º 15
0
def _is_fuzzy_match(s1, s2, threshold=90):
    """Tell whether ``s1`` fuzzily matches ``s2``.

    When ``s2`` is a dict, its values are the candidates and the result is
    whether ``process.extractOne`` finds one with ``score_cutoff=threshold``.
    Otherwise ``s2`` is compared directly and must score strictly above
    ``threshold``.

    NOTE(review): the dict branch delegates the cutoff comparison to
    ``extractOne`` while the direct branch uses ``>``; confirm whether a
    score exactly equal to ``threshold`` should count in both.
    """
    if isinstance(s2, dict):
        best_match = process.extractOne(s1, s2.values(), score_cutoff=threshold)
        return bool(best_match)
    return fuzz.ratio(s1, s2) > threshold
def highest_fuzz(input_list, input_item):
    """Return the element of ``input_list`` most similar to ``input_item``.

    Elements are compared with ``fuzz.ratio``; on a tie the element seen
    first wins. An empty unicode string is returned when the list is empty
    or no element scores above zero.
    """
    winner, top_score = u'', 0
    for candidate in input_list:
        score = fuzz.ratio(input_item, candidate)
        if score > top_score:
            winner, top_score = candidate, score
    return winner
Ejemplo n.º 17
0
def get_sefaria_english_parsha(parsha_name):
    """Return the entry of ``eng_parshiot`` most similar to ``parsha_name``.

    Entries are compared with ``fuzz.ratio``; on a tie the entry seen first
    wins. The integer ``0`` is returned when no entry scores above zero.
    """
    top_score = 0
    winner = 0
    for candidate in eng_parshiot:
        score = fuzz.ratio(parsha_name, candidate)
        if score > top_score:
            top_score = score
            winner = candidate
    return winner
Ejemplo n.º 18
0
def show_folders(folders, mdfind_results3, searchword, skiplist):
    """
    Print the folders found for a search word, coloured with ANSI escapes.

    Nothing is printed unless there is at least one folder and fewer than 50
    mdfind results. When there are 10 or more folders, a folder is left out if
    its lowercased path contains a (lowercased) skiplist entry or if it is more
    than 85% similar to the folder before it. A folder that is more than 90%
    similar to the one after it is printed with its directory part greyed out.
    The output is ordered by descending (number of "/", length, text) of the
    coloured line.

    @type folders: list
    @type mdfind_results3: list
    @type searchword: str
    @type skiplist: list
    @return: None
    """
    folders2 = []

    if len(folders) > 0 and len(mdfind_results3) < 50:
        print()
        print("\033[91m[" + searchword + "] Folders:\033[0m")
        # Short lists are always shown in full.
        keep_all = len(folders) < 10
        skipwords = [item.lower() for item in skiplist]
        last = None

        for position, i in enumerate(folders):
            nexti = folders[position + 1] if position + 1 < len(folders) else None

            skip = False
            if not keep_all:
                skipi = i.lower()
                if any(item in skipi for item in skipwords):
                    skip = True
                elif last and fuzz.ratio(i, last) > 85:
                    skip = True

            if not skip:
                if nexti and fuzz.ratio(i, nexti) > 90:
                    # Near-duplicate of the next entry: grey out the shared directory part.
                    newi = "\033[90m" + str(os.path.dirname(i)) + "\033[34m/" + str(os.path.basename(i)) + "\033[0m"
                else:
                    newi = "\033[34m" + str(i) + "\033[0m"

                folders2.append(newi)

            # Remember this folder for the next iteration's similarity check;
            # without this assignment the check above could never fire.
            last = i

    folders2.sort(key=lambda x: (x.count("/"), len(x), x))
    folders2.reverse()

    for i in folders2:
        print(i)
def analysis():
    """Print three seniority-level estimates for every job title in ``jobs``.

    For each document returned by ``jobs.find()`` three indices in 0..3 are
    computed and printed together with the title:
      * design_match -- the level whose keyword list contains the keyword with
        the highest ``fuzz.ratio`` against the cleaned title;
      * design_probs -- the level with the largest summed per-word value taken
        from the ``prob_collection`` Mongo collection;
      * design_exp   -- the level with the largest weighted ``prob_workex``
        value for the job's minimum and maximum experience.

    NOTE(review): ``jobs`` and ``prob_workex`` are defined outside this block;
    ``prob_workex`` is presumably a 4-element per-level sequence -- confirm.
    """
    # Keyword lists, one per level (index 0..3 in the result lists below).
    MID_MANAGER_LEVEL = ['senior', 'sr']
    MANAGER_LEVEL = ['manager', 'lead', 'head',  'leader', 'gerente','specialist']
    DIRECTOR_LEVEL = ['director', 'partner', 'general', 'managing', 'gm', 'dgm', 'agm']
    BOARD_LEVEL = ['president','md','vice','vp', 'avp', 'entrepreneur', 'owner', 'proprietor', 'chairman', 'founder', 'board', 'chief', 'ceo', 'cto', 'cfo', 'coo', 'cro', 'cmo', 'cso', 'cio']
    mongo_server = MongoClient(['dev-mongo2.grownout.com:27017','dev-mongo1.grownout.com:27017'],replicaset='amoeba-mongo')
    designation_db = mongo_server['designation_database']
    designation_prob_collection = designation_db.prob_collection

    # Per-word level values, keyed by the document's 'name' field.
    designation_probs={}


    for designation in designation_prob_collection.find():
        designation_probs[designation['name']] = designation


    # naukri_tittle_exp={}
    print designation_probs
    for k in jobs.find():
        
        # Clean the title: dashes and runs of non-alphanumerics become single spaces.
        designation=k['title'].replace("-"," ")
        designation=re.sub('[^0-9a-zA-Z]+', ' ', designation)
        designation_words=designation.split(" ")
        prob_designation=[0.0]*4
        prob_experience=[0.0]*4
        # Sum the stored level_1..level_4 values of every known word in the title.
        for word in designation_words:
            
            if word.lower() in designation_probs:
                # print "entered true"
                prob_designation[0]+=float(designation_probs[word.lower()]['level_1'])
                prob_designation[1]+=float(designation_probs[word.lower()]['level_2'])
                prob_designation[2]+=float(designation_probs[word.lower()]['level_3'])
                prob_designation[3]+=float(designation_probs[word.lower()]['level_4'])
        # print prob_designation,"this is prob_designation", "  ",designation
        # break
        # index() returns the first maximum, so an all-zero list yields level 0.
        design_probs =prob_designation.index(max(prob_designation))
        # print design_probs
        # Best fuzzy score of the whole cleaned title against each level's keywords.
        match_designation=[0.0]*4
        for j in MID_MANAGER_LEVEL:
            match_designation[0]=max(fuzz.ratio(designation,j),match_designation[0])
        for j in MANAGER_LEVEL:
            match_designation[1]=max(fuzz.ratio(designation,j),match_designation[1])
        for j in DIRECTOR_LEVEL:
            match_designation[2]=max(fuzz.ratio(designation,j),match_designation[2])
        for j in BOARD_LEVEL:
            match_designation[3]=max(fuzz.ratio(designation,j),match_designation[3])
        min_exp_probs=prob_workex(float(k['experience']['min']))
        max_exp_probs=prob_workex(float(k['experience']['max']))
        # print "min ",min_exp_probs
        # print "max ",max_exp_probs
        # Weighted mean of the two experience values: weight 0.8 for min, 1.0 for max.
        for i in range(4):
            prob_experience[i]+=(((min_exp_probs[i])*0.8)+((max_exp_probs[i])))/1.8
        # print prob_experience," prob experience"
        design_exp=prob_experience.index(max(prob_experience))
        # print match_designation,"this is match designation"
        design_match=match_designation.index(max(match_designation))
        # if design_exp!=design_probs:
        print design_match,":",design_probs,":",design_exp,"     ",k['title']
Ejemplo n.º 20
0
def compare_strings(string_one, string_two):
    """Return the best similarity score between two strings.

    The result is the largest of ``fuzz.ratio``, ``fuzz.token_sort_ratio``
    and ``fuzz.token_set_ratio`` for the pair, and never less than 0.
    """
    scorers = (fuzz.ratio, fuzz.token_sort_ratio, fuzz.token_set_ratio)
    scores = [scorer(string_one, string_two) for scorer in scorers]
    return max(0, *scores)
Ejemplo n.º 21
0
 def OnChanMsg(self, nick, channel, message):
     """Track !keep/!approve requests in #xshellz and audit xinfo's replies.

     A ``!keep <user>`` or ``!approve <user>`` message is remembered in
     ``self.buffer`` under the sender's nick. When ``xinfo`` later posts a
     line matched by ``self.appr`` or ``self.kp`` (group 1: nick, group 2:
     user) for a remembered nick and the same user, the nick and host are
     recorded in ``self.nd[user]`` and every other user's nicks, hosts and
     name are checked for exact or fuzzy collisions. Each collision is sent
     to ``self.svreport``. Messages from other channels are ignored.

     Always returns ``znc.CONTINUE``.
     """
     if not channel.GetName() == "#xshellz":
         return znc.CONTINUE
     text = str(message)
     words = text.split(" ")
     cmd = words[0].lower()
     nickn = nick.GetNick()
     try:
         username = words[1].lower()
     except IndexError:
         return znc.CONTINUE
     if cmd == '!keep' or cmd == '!approve':
         self.buffer[nickn] = dict(ni=nick,user=username)
         return znc.CONTINUE
     if not nickn == 'xinfo':
         return znc.CONTINUE
     # Search the message text; the previous code passed the list of split
     # words to search().
     # NOTE(review): assumes self.appr / self.kp are compiled regexes -- confirm.
     res = self.appr.search(text)
     if not res:
         res = self.kp.search(text)
     if not res:
         return znc.CONTINUE
     nck = res.group(1)
     user = res.group(2)
     nd = self.buffer.pop(nck, None)
     if not nd or user != nd['user']:
         return znc.CONTINUE

     nu = nd['ni']
     self.nd[user]['nick'].append(nck)
     host = nu.GetHost()
     if host == 'shell.xshellz.com':
         # The shared shell host identifies nobody; record it as unknown.
         host = None
     # Index by the user variable, not the literal string 'user'.
     self.nd[user]['hosts'].append(host)
     got = False
     for z,x in self.nd.items():
         if z == user:
             continue
         if nck in x['nick']:
             self.svreport("{0} is in the nick list for {1} ({2}) but user requested is {3} !att-nick-match".format(nck, z, " ,".join(x['nick']),user))
             got = True
         else:
             for y in x['nick']:
                 # Compare nick with nick (the report text is about nicks).
                 if fuzz.ratio(y,nck) >= 50:
                     self.svreport("{0} is a fuzzy match against {1}'s nick {2}, but the user is {3} !att-nick-fuzzy-match".format(nck,z,y,user))
         # Unknown hosts (None) cannot be compared or joined as text.
         if host is not None:
             known_hosts = [h for h in x['hosts'] if h is not None]
             if host in known_hosts:
                 got = True
                 self.svreport("{0}'s host ({4}) is in the host list for {1} ({2}) but user requested is {3} !att-host-match".format(nck, z, " ,".join(known_hosts),user,host))
             else:
                 for y in known_hosts:
                     if fuzz.ratio(y,host) >= 95:
                         self.svreport("{0} is a fuzzy match against {1}'s host {2}, but the user is {3} !att-host-fuzzy-match".format(host,z,y,user))
         if fuzz.ratio(z,user) >= 50:
             self.svreport("{0} is a fuzzy match against {1} !att-user-fuzzy-match".format(z,user))
         if got:
             # One exact nick/host collision is enough; stop scanning.
             break
     self.write()
     return znc.CONTINUE
Ejemplo n.º 22
0
    def print_list(self, match=0):
        """Write the keys of non-``KeychainItem`` entries to ``self.stdout``.

        With a falsy ``match`` every such key is written, one per line.
        Otherwise a key is written when its ``fuzz.ratio`` against ``match``
        is above 50, or above 25 while the key also matches the regular
        expression ``.*<match>.*``.

        ``match`` is inserted into the pattern unescaped, so regex
        metacharacters in it keep their special meaning.
        """
        # Compile once instead of once per item.
        regex = re.compile(".*"+match+".*") if match else None
        for key, value in self._items.items():
            if type(value).__name__ == "KeychainItem":
                continue
            if not match:
                self.stdout.write("%s\n" % key)
                continue
            score = fuzz.ratio(match, key)
            if score > 50 or (score > 25 and regex.match(key)):
                self.stdout.write("%s\n" % key)
        return
def _extraction_matches(results, feature, expected):
    """Return True when any extracted value in ``results`` matches ``expected``."""
    if feature == "phone":
        # phone number has to be exactly the same (digits only)
        return any(result == re.sub(r"\D", "", expected) for result in results)
    # other features tolerate some minor difference
    return any(fuzz.ratio(str(result), expected) >= 80 for result in results)


def validate(document, parsed_query):
    """Check that ``document`` satisfies every required match field of the query.

    ``parsed_query["required_match_field"]`` maps feature names to a value or
    a list of values. A feature is satisfied when:
      * the value (any value of a list) occurs, case-insensitively, in the
        document's raw content or extracted text; for ``location`` each
        comma-separated part is tried, and a part found in ``state_abbr_dic``
        matches either as a case-sensitive abbreviation delimited by
        non-letters or through its full name; or
      * a value produced by ``extraction.functionDic[feature]`` matches it --
        exactly on digits for ``phone``, with ``fuzz.ratio`` >= 80 otherwise.

    Returns ``True`` only when every feature is satisfied, ``False`` as soon
    as one is not.
    """
    raw_content = extraction.get_raw_content(document)
    extract_text = extraction.get_text(document)
    matchword = parsed_query["required_match_field"]  # Validation fields
    lower_raw_content = raw_content.lower()
    lower_extract_text = extract_text.lower()
    for feature in matchword:
        expected = matchword[feature]
        isValid = False
        # Check first if the feature value is in the text or not
        if isinstance(expected, list):
            for value in expected:
                # Either in the raw_content or extract_text is regarded as valid
                if value.lower() in lower_raw_content or value.lower() in lower_extract_text:
                    isValid = True
                    break
        elif feature == "location":
            # Special case for location which has two fields (city and state) in value
            for location_field in (field.strip() for field in expected.split(",")):
                if location_field in state_abbr_dic:
                    # Validating a state abbreviation is case sensitive
                    state_pattern = r"(?:[^A-Za-z])("+location_field+")(?:[^A-Za-z])"
                    state_name = state_abbr_dic[location_field].lower()
                    # .lower() must be called: the previous code tested the
                    # bound method itself against the raw content (TypeError).
                    if (re.search(state_pattern, raw_content)
                            or re.search(state_pattern, extract_text)
                            or state_name in lower_extract_text
                            or state_name in lower_raw_content):
                        isValid = True
                elif location_field.lower() in lower_raw_content or location_field.lower() in lower_extract_text:
                    isValid = True
        elif expected.lower() in lower_raw_content or expected.lower() in lower_extract_text:
            isValid = True
        if isValid:
            continue
        # If the document does not contain the raw string, do extractions and match
        results = extraction.functionDic[feature](document, True)
        isValid = _extraction_matches(results, feature, expected)
        if extract_text:
            results = extraction.functionDic[feature](document, False)
            if _extraction_matches(results, feature, expected):
                isValid = True
        if not isValid:
            return False
    return True
Ejemplo n.º 24
0
def get_article_groups(article_list, fuzz_len=3000, min_ratio=70):
    """
    Partition a list of articles into groups of near-duplicates.

    arguments:
        article_list: list of articles; element 3 of each article is its text
        fuzz_len (int, default=3000): maximum number of characters taken from
            the beginning/end of an article for the comparison
        min_ratio (int, default=70): minimum fuzz score (as percentage) at
            which two articles are considered duplicates

    returns:
        list of groups, each a list of articles in their original order. The
        first article of a group is the representative every later member was
        matched against. A one-element ``article_list`` is returned wrapped
        in a list as its only group.
    """
    if len(article_list) == 1:
        return [article_list]

    def _is_duplicate(rep_text, text, text_len):
        # Compare heads and tails in every combination, stopping at the first
        # comparison that reaches min_ratio. This is slow but deliberately careful.
        span = min(fuzz_len, text_len, len(rep_text))
        pairs = (
            (rep_text[:span], text[:span]),
            (rep_text[-span:], text[-span:]),
            (rep_text[-span:], text[:span]),
            (text[-span:], rep_text[:span]),
        )
        return any(fuzz.ratio(left, right) >= min_ratio for left, right in pairs)

    representatives = []
    groups = []
    for article in article_list:
        text = article[3]
        text_len = len(text)
        for group_id, representative in enumerate(representatives):
            if _is_duplicate(representative[3], text, text_len):
                groups[group_id].append(article)
                break
        else:
            # no representative is close enough: this article starts a new group
            representatives.append(article)
            groups.append([article])
    return groups
Ejemplo n.º 25
0
def fuzz_roadnames_num_test(roadn):
    """Print the road names in ``roadn`` that best match ``testnameNUM``.

    Every road name is scored twice against the module-level
    ``testnameNUM``: once using only the digit groups found in the name and
    once using the whole name. The better of the two scores is kept per road,
    and every road that reaches the overall maximum is printed.

    Args:
        roadn: sequence of road-name strings.

    Returns:
        None. Results are written to stdout; an empty ``roadn`` prints only
        the header.
    """
    best_scores = []
    for cn in roadn:
        # fuzz.ratio compares strings, so the digit groups are joined first.
        # NOTE(review): space-joining the groups is an assumption about the
        # intended comparison -- confirm against how testnameNUM is built.
        digits = " ".join(re.findall(r"\d+", cn))
        # One score per road keeps score positions aligned with roadn; the
        # previous two-entries-per-road list produced indices that pointed at
        # the wrong road (or past the end of roadn).
        best_scores.append(max(fuzz.ratio(digits, testnameNUM),
                               fuzz.ratio(cn, testnameNUM)))

    print('\n\n\n\n\nNUM TEST')
    if not best_scores:
        return

    maxratNUM = max(best_scores)
    for cn, score in zip(roadn, best_scores):
        if score == maxratNUM:
            print(cn)
Ejemplo n.º 26
0
def xlparse(filepath,classname):
    """Extract net revenue and net income from an income-statement workbook.

    The worksheet at index 1 of the workbook is read. Column A is treated as
    row labels and column B as the matching values. A label counts as a match
    when its fuzz ratio against one of the known spellings is above 70; when
    several rows match, the last one wins.

    Args:
        filepath: path handed to ``openpyxl.load_workbook``.
        classname: unused on input; kept so existing callers keep working.

    Returns:
        The object produced by ``quarterlyincome(netrevenue, netincome)``.

    Raises:
        ValueError: if no non-blank row matches a net revenue label or a net
            income label.
    """
    wb = openpyxl.load_workbook(filepath)
    sheetnames = wb.get_sheet_names() #getting list of all sheetnames

    #section for dealing with sheet 1 which is consolidated statement of income
    sheet = wb.get_sheet_by_name(sheetnames[1])
    cola = [cellobj.value for cellobj in sheet.columns[0]] #labels (column A)
    colb = [cellobj.value for cellobj in sheet.columns[1]] #values (column B)

    #extracting information on millions vs thousands from columnArowA
    #format 'Consolidated Condensed Statements of Income - USD ($) shares in Millions, $ in Millions'
    #both results are lists and are not used further down yet
    sharedeno = re.findall(r'shares in ([a-zA-Z]+)',cola[0])
    dollardeno = re.findall(r'\$ in ([a-zA-Z]+)',cola[0]) #'\$' matches a literal dollar sign rather than end of line

    #netrevenue  -cost ofsales = grossmargin
    #grossmargin - operating expenses = operating income
    #operating expenses = randd + marketing + restructuring + amortization
    #operating income - gains(losses) on equity investments + interest and other... = income before taxes
    #income before taxes - provision for taxes = net income
    #netincome/basic shares = basic earnings per share
    #netincome/diluted shares = diluted earnings per share of common stock

    #both lists should keep growing as new label variants from other companies show up
    netrevenuelist = ['Net sales','Netsales','netsales','Net revenue','netrevenue']
    netincomelist = ['Net income','netincome']

    logger.debug('%s,%s',cola,colb)
    netrevenue = None
    netincome = None
    for name,value in zip(cola,colb):
        if name is None or value is None:
            #blank label or blank value cell: nothing usable in this row
            continue
        for entry in netrevenuelist:
            score = fuzz.ratio(entry,name)
            if score > 70:
                logger.info('%s,%s',name,score)
                netrevenue = value
                break
        for entry in netincomelist:
            score = fuzz.ratio(entry,name)
            if score > 70:
                logger.info('%s,%s',name,score)
                netincome = value
                break

    if netrevenue is None or netincome is None:
        #previously this surfaced as an UnboundLocalError on the line below
        raise ValueError('net revenue / net income row not found in %s' % filepath)

    classname = quarterlyincome(netrevenue,netincome)    #wrap the income values

    return classname
Ejemplo n.º 27
0
def parse_track(spotify, line):
	"""Look up the Spotify track that a free-text comment most likely names.

	The comment is turned into a search string by replacing " by ", " - "
	and "-" with a space, each only when it occurs exactly once. The search
	results are offered to ``process.extractOne`` as both "artist title" and
	"title artist"; the winner is returned only if its fuzz ratio against
	the original ``line`` is above 50.

	Returns the chosen track dict, or None when the search string is blank,
	the search raises, nothing is found, or the best candidate scores too low.
	"""
	query = line
	# Checked in order, each against the already-rewritten text.
	for separator in (" by ", " - ", "-"):
		if query.count(separator) == 1:
			query = query.replace(separator, " ")

	if query.strip() == "":
		return None

	log("  Searching for " + query + " AND NOT Karaoke..", 3)
	try:
		spotify_login()
		results = spotify.search(query + " AND NOT Karaoke", limit=50, type='track')
	except Exception as err:
		log("Error searching for track", 1)
		log(str(err), 1)
		return None

	log("  Searching for track finished", 3)

	items = results['tracks']['items']
	if len(items) == 0:
		return None

	choices = []
	track_hash = {}
	for t in items:
		artist = t['artists'][0]['name']
		title = t['name']
		# Register both word orders so either phrasing of the comment can match.
		for label in (artist + " " + title, title + " " + artist):
			log("  Appending choice: " + label, 3)
			choices.append(label)
			track_hash[label] = t

	best_track = process.extractOne(query, choices)
	best_label = best_track[0]
	best_t = track_hash[best_label]

	log("  Closest match: " + best_label + " (" + str(best_track[1]) + ")" + " for comment [" + query + "]", 3)

	# The final gate scores against the untouched comment, not the search string.
	line_score = fuzz.ratio(line, best_label)
	if line_score > 50:
		log("  Returning track " + best_t['name'] + " for comment [" + line + "]", 2)
		return best_t

	log("  Fuzz ratio discarding '" + best_label + "' with score: " + str(line_score), 3)
	return None
Ejemplo n.º 28
0
 def compare_two_texts(self, string_a, string_b, normalize_value=True):
     """
     Score the similarity of two strings with the Simple Ratio algorithm.

     Both arguments must be of the same text type (both ``unicode`` or both
     ``str``); any other combination raises TypeError. With
     ``normalize_value`` set, the raw ratio is passed through
     ``__normalized_value`` before being returned; otherwise the raw
     ``fuzz.ratio`` result is returned.
     """
     both_unicode = isinstance(string_a, unicode) and isinstance(string_b, unicode)
     both_str = isinstance(string_a, str) and isinstance(string_b, str)
     if not (both_unicode or both_str):
         raise TypeError

     score = fuzz.ratio(string_a, string_b)
     if not normalize_value:
         return score
     return self.__normalized_value(score)
Ejemplo n.º 29
0
Archivo: run.py Proyecto: ed/portify
def get_spotify_id(t):
    """Return the Spotify id of the first search hit matching ``t``.

    Args:
        t: pair ``(artist, track)`` of strings.

    Returns:
        The ``id`` of the first result whose lowercased artist name has a
        fuzz ratio above 60 with ``artist`` and whose lowercased track name
        has a fuzz ratio of at least 60 with ``track``.

    Raises:
        ValueError: if no search result passes both checks.
    """
    # Local import keeps the block self-contained.
    from urllib.parse import quote

    artist = t[0]
    track = t[1]
    # Names may contain spaces, '&', '#', ... which would otherwise corrupt
    # the query string, so both are percent-encoded before interpolation.
    url = "https://api.spotify.com/v1/search?q=%s+artist:%s&type=track&market=US&" % (
        quote(track, safe=""), quote(artist, safe=""))
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url) as r:
        data = json.load(r)
    for item in data["tracks"]["items"]:
        if fuzz.ratio(artist, item["artists"][0]["name"].lower()) > 60:
            if fuzz.ratio(track, item["name"].lower()) >= 60:
                return item["id"]
    raise ValueError('track not found')
Ejemplo n.º 30
0
def response_correct(response, answer):
    """Decide whether ``response`` is close enough to ``answer``.

    Both texts are passed through ``filter_words``; the answer is also tried
    with ``strip_brackets`` applied. Four similarity scores are taken (token
    sort ratio on the filtered texts, plain ratio on the same texts with all
    spaces removed) and the response counts as correct when the best of them
    is above 70.
    """
    given = filter_words(response)
    expected = filter_words(answer)
    expected_plain = strip_brackets(expected)

    given_squashed = given.replace(" ", "")
    expected_squashed = expected.replace(" ", "")
    expected_plain_squashed = expected_plain.replace(" ", "")

    candidates = [
        fuzz.token_sort_ratio(given, expected),
        fuzz.token_sort_ratio(given, expected_plain),
        fuzz.ratio(given_squashed, expected_squashed),
        fuzz.ratio(given_squashed, expected_plain_squashed),
    ]
    best = max(candidates)
    return best > 70
Ejemplo n.º 31
0
            #     print("Both O and X")
            # print("Average Precision:")
            # print("{0:<50f}{1:<50f}".format(ap, ap_old))
            # past_avep += ap
            # past_avep_old += ap_old

            # Coverage Scoring
            all_recommendation_set = all_recommendation_set | set(
                top_5_recommendation_list)
            if query_item not in seen_query:
                seen_query.add(query_item)
                total_rec_num += len(top_5_recommendation_list)

            # Edit Distance Scoring
            edit_distance_query_to_rec = \
                    [fuzz.ratio(item_detail_map[query_item], item_detail_map[rec_item]) for rec_item in top_5_recommendation_list]
            ave_edit_distance_query_to_rec = sum(
                edit_distance_query_to_rec) / len(edit_distance_query_to_rec)
            total_ave_distance_query_to_rec += ave_edit_distance_query_to_rec
            print('=' * 20)
            print("Levenshtein Distance:")
            print("Average Distance (query to rec): {}".format(
                ave_edit_distance_query_to_rec))
            edit_distance_rec_to_rec = \
                    [fuzz.ratio(item_detail_map[rec_item_first], item_detail_map[rec_item_second])\
                     for rec_item_first, rec_item_second in combinations(top_5_recommendation_list, 2)]
            ave_edit_distance_rec_to_rec = sum(edit_distance_rec_to_rec) / len(
                edit_distance_rec_to_rec)
            total_ave_distance_rec_to_rec += ave_edit_distance_rec_to_rec
            print("Average Distance (rec to rec): {}".format(
                ave_edit_distance_rec_to_rec))
	def init_agents(population, string_length):
		"""Build the starting population.

		Returns a list of ``population`` agents, each created as
		``Agent(string_length)``. A list (not a generator) is returned
		because the later steps sort it and take its length.
		"""
		return [Agent(string_length) for _ in xrange(population)]

	#Now we need to figure out how we do our fitness:
	def fitness(agents):
		"""Score every agent against the target string.

		Each agent's ``fitness`` attribute is set to the fuzz ratio between
		its ``string`` and the global ``in_str``. The same ``agents``
		collection is returned.
		"""
		for agent in agents:
			# Attribute assignment; the former "agent,fitness = ..." tried to
			# unpack the integer score into two names.
			agent.fitness = fuzz.ratio(agent.string, in_str)

		return agents

	#We also need a function to select who was the best:
	def selection (agents):
		"""Keep the fitter half of the population.

		Agents are sorted by descending ``fitness``, the sorted population is
		printed, and the first half of that ordering is returned.
		"""
		agents = sorted(agents, key = lambda agent: agent.fitness, reverse = True)
		print(''.join(map(str, agents)))

		# Only the best half survives; the result must be returned because
		# sorted() produced a new list rather than changing the caller's.
		return agents[:int(0.5 * len(agents))]

	#This is a confusing function. But this is crossover.
	#The idea is mixing the qualities of the agents in the current iteration with the next iteration.
	#Previously we selected the most fit parents, so the average of the whole population should be better. 
	def crossover(agents):
		"""Refill the population with children of randomly paired survivors.

		For every two missing slots (global ``population`` minus the number
		of surviving agents) two parents are drawn at random from ``agents``
		and two children are created by swapping the parents' strings at a
		random split point. The children are appended to ``agents``, which
		is returned.
		"""
		babies = []

		# Floor division keeps the count an integer on Python 3 as well.
		for _ in xrange((population - len(agents)) // 2):

			# Parents are drawn independently, so both may be the same agent.
			parent1 = random.choice(agents)
			parent2 = random.choice(agents)

			child1 = Agent(in_str_len)
			child2 = Agent(in_str_len)
			# Random cut position shared by both children.
			split = random.randint(0, in_str_len)

			# Each child takes the head of one parent and the tail of the other.
			child1.string = parent1.string[0:split] + parent2.string[split:in_str_len]
			child2.string = parent2.string[0:split] + parent1.string[split:in_str_len]

			babies.append(child1)
			babies.append(child2)

		agents.extend(babies)
		return agents

	#Now all we need is our mutation function. I think just sport mutations will be fine:
	def mutation(agents):
		"""Randomly replace characters in every agent's string.

		Each character position has a 10% chance of being replaced by a
		random ASCII letter. The same ``agents`` collection is returned.
		"""
		for agent in agents:
			# enumerate() walks the string as it was when the loop started;
			# agent.string is rebound to a new string on each mutation.
			for idx, _ in enumerate(agent.string):

				if random.uniform(0.0, 1.0) <=0.1:

					# string.ascii_letters exists on both Python 2 and 3
					# (string.letters is Python 2 only).
					agent.string = agent.string[0:idx] + random.choice(string.ascii_letters) + agent.string[idx+1:in_str_len]

		return agents


# Script entry point: define the target string, then run the search.
if __name__ == '__main__':

    # Target string; fitness() reads it as a global.
    in_str = 'AndyMiller'
    # Target length; crossover() and mutation() read it as a global.
    in_str_len = len(in_str)
    genetic_algorithm()
Ejemplo n.º 33
0
    master_data["Final_String_exist"]=master_data["Final_String_exist"]+" "+master_data[i].astype(str)

print(master_data["Final_String_exist"][0])
new_data["NAD Key"]=""
#output=pd.DataFrame()
#new_data["Final_String_new"]=new_data["Final_String_new"]
new_data["Final_String_exist"]=""
new_data["NAD Key"]=np.NaN
new_data["F_Ratio"]=np.NaN
new_data["Indices"]=np.NaN
output=pd.DataFrame()
output=new_data
count=1
for i in range(0,len(new_data)):
    for j in range(0,len(master_data)):        
        f_ratio=fuzz.ratio(new_data["Final_String_new"][i],master_data["Final_String_exist"][j])
        print(f_ratio)
        
        
        if f_ratio > 60:     
           # new_data.loc[new_data.index[i],"Final_String_exist"]=master_data["Final_String_exist"][j]
            if new_data.loc[new_data.index[i],"NAD Key"] is np.NaN:
                new_data.loc[new_data.index[i],"F_Ratio"]=f_ratio
                new_data.loc[new_data.index[i],"NAD Key"]=master_data["NAD Key"][j]
                new_data.loc[new_data.index[i],"Final_String_exist"]=master_data["Final_String_exist"][j]
                new_data.loc[new_data.index[i],"Indices"]=i
            else:
                new_data=new_data.append({"NAD Key":master_data["NAD Key"][j],"F_Ratio":f_ratio,"Final_String_exist":master_data["Final_String_exist"][j],"Final_String_new":new_data["Final_String_new"][i],'Primise':new_data["Primise"][i], 'ThoroughFare No':new_data["ThoroughFare No"][i], 'ThoroughFare':new_data['ThoroughFare'][i], 'ThoroughFare Name':new_data['ThoroughFare Name'][i],
       'Town':new_data['Town'][i], 'Postoutcode':new_data['Postoutcode'][i], 'PostIncode':new_data['PostIncode'][i], 'Country':new_data['Country'][i],'Indices':i},ignore_index=True)
               # new_data.loc[new_data.index[len(new_data)+1],"Final_String_exist"]=master_data["Final_String_exist"][j]
Ejemplo n.º 34
0
    def load_clue(self, coords):
        """Present the clue at ``coords``, collect a response and update state.

        ``coords`` is a two-character string: a column letter ``a``-``f``
        selecting a category (categories are taken in sorted order) followed
        by a row number selecting a value. Any invalid or unavailable
        selection goes back to the board via ``display_board()``.

        For a daily double one player is chosen, places a wager, and gains or
        loses that wager. For a regular clue the user may guess, show the
        answer (which records the result through ``record_guess_result``) or
        cancel. When the clue is finished the board is shown again, or the
        interstitial if no active clue remains in the current round.
        """
        # Every validation failure returns immediately; previously most of
        # these branches fell through and crashed further down once
        # display_board() came back.
        if len(coords) != 2:
            return self.display_board()

        c = coords[0].lower()
        r = coords[1].lower()
        if c not in ['a', 'b', 'c', 'd', 'e', 'f']:
            return self.display_board()

        try:
            row = int(r) - 1
        except ValueError:
            # Row character is not a number.
            return self.display_board()

        current_round = self.rounds[self.current_round]
        category = sorted(list(current_round.keys()))[ord(c) - ord('a')]

        all_values = set()
        for cat in current_round:
            for v in current_round[cat]:
                all_values.add(v)

        # NOTE(review): this tests whether the row index is itself one of the
        # clue values, then uses it as a position in the sorted values. Kept
        # as-is; confirm whether a range check was intended.
        if row in all_values:
            val = sorted(all_values)[row]
        else:
            return self.display_board()

        if val not in current_round[category]:
            return self.display_board()

        clue = current_round[category][val]

        if not clue['active']:
            return self.display_board()

        # Labels such as "Name (1)" used in prompts and to validate the
        # player number typed for a daily double.
        players = []
        for idx in range(1, len(self.players.keys()) + 1):
            players.append(f'{self.players[idx]["name"]} ({idx})')

        self.clear_screen()

        print('\n\n\n\n\n\n\n\n\n\n\n\n')

        if clue['daily_double']:
            self.print_centered('Daily Double!')
            self.print_centered(category)
            self.print_centered('\n\n\n')

            player = ''
            while player not in self.players:
                player = self.prompt_centered(f'Who\'s guessing? {"; ".join(players)}', suffix='\n ')

                if player.isdigit():
                    player = int(player)
                elif player == '':
                    # Empty input abandons the clue without marking it used.
                    return

                if f' ({player})' not in ';'.join(players):
                    player = ''
                    continue

            self.print_centered('\n\n\n')
            wager = self.prompt_centered(
                f'How much are you wagering, {self.players[player]["name"]}?', suffix='\n '
            )

            self.print_centered(clue['text'], wrap=True)
            self.print_centered('\n\n')

            answer = clue['answer'].lower()
            guess = self.prompt_centered('What is ', suffix='\n ').lower()
            print('\n\n')

            # The four fuzzy scores only produce a hint; the final verdict is
            # entered by hand below.
            scores = [
                fuzz.ratio(guess, answer),
                fuzz.partial_ratio(guess, answer),
                fuzz.token_sort_ratio(guess, answer),
                fuzz.token_set_ratio(guess, answer)
            ]
            if min(scores) == 100:
                self.print_centered('Correct. The official answer is:')
            elif max(scores) >= 75 and min(scores) >= 50:
                self.print_centered('Likely correct')
            elif max(scores) >= 50 and min(scores) >= 35:
                self.print_centered('Not far off')
            else:
                self.print_centered('Unlikely to be right')

            # Wait for Enter before revealing the official answer.
            input()
            print('\n\n')

            self.print_centered(textwrap.fill(f'What is {clue["answer"]}?'))

            correct = ''
            while correct.lower() not in ['y', 'n']:
                correct = self.prompt_centered(f'Was {self.players[player]["name"]} right? Y/N', suffix='\n ')

            if correct.lower() == 'y':
                self.players[player]['points'] += int(wager)
            else:
                self.players[player]['points'] -= int(wager)

            self.rounds[self.current_round][category][val]['active'] = False
        else:
            self.print_centered(f'{category} for ${val}')
            self.print_centered('\n\n')
            self.print_centered(clue['text'], wrap=True)
            self.print_centered('\n\n')

            choice = ''
            while choice.lower() not in ['g', 's', 'c']:
                choice = self.prompt_centered('(G)uess, (S)how, (C)ancel', suffix='\n ')

                if choice.lower() == 'g':
                    answer = clue['answer'].lower()
                    guess = self.prompt_centered('What is ', suffix='\n ').lower()
                    # Reset so the menu is offered again unless the guess is
                    # an exact match.
                    choice = ''
                    print('\n\n')

                    scores = [
                        fuzz.ratio(guess, answer),
                        fuzz.partial_ratio(guess, answer),
                        fuzz.token_sort_ratio(guess, answer),
                        fuzz.token_set_ratio(guess, answer)
                    ]
                    if min(scores) == 100:
                        self.print_centered('Correct. The official answer is:')
                        # Exact match: fall straight through to "show".
                        choice = 's'
                    elif max(scores) >= 75 and min(scores) >= 50:
                        self.print_centered('Likely correct')
                    elif max(scores) >= 50 and min(scores) >= 35:
                        self.print_centered('Not far off')
                    else:
                        self.print_centered('Unlikely to be right')

                if choice.lower() == 's':
                    self.print_centered(textwrap.fill(f'What is {clue["answer"]}?'))

                    self.record_guess_result(val, players)

                    self.rounds[self.current_round][category][val]['active'] = False

        # Decide what to show next: the board while clues remain, otherwise
        # the interstitial.
        active_clues = False
        for category in self.rounds[self.current_round]:
            for val in self.rounds[self.current_round][category]:
                if self.rounds[self.current_round][category].get(val, {}).get('active', False):
                    active_clues = True

        if active_clues:
            self.display_board()
        else:
            self.display_interstitial()
Ejemplo n.º 35
0
 def get_representatives(self, word, representatives, threshold=70):
     """
     Return the entries of ``representatives`` whose fuzz ratio against
     ``word`` is at least ``threshold``, in their original order.
     """
     close_enough = []
     for candidate in representatives:
         if fuzz.ratio(word, candidate) >= threshold:
             close_enough.append(candidate)
     return close_enough
Ejemplo n.º 36
0
    def find_component_match(self, title, body, template_data):
        '''Make a list of matching files for arbitrary text in an issue

        Strategies are tried in order and the first one that yields matches
        wins:

        1. If ``body`` contains a Python traceback, derive a filename from
           its ``DistributionNotFound`` / ``File`` lines.
        2. Look up ``template_data[u'component_raw']`` in the component map
           via ``_string_to_cmap_key``.
        3. Return a cached result for the same raw component text.
        4. Retry the component map with word n-grams, largest first.
        5. Fuzzy-match each line against the component map keys (ratio >= 90).
        6. Match the individual words against repository file paths.

        ``title`` is not used. Returns a list of matches (possibly empty);
        results of steps 4-6 are stored in ``self.match_cache``.
        '''

        # DistributionNotFound: The 'jinja2<2.9' distribution was not found and
        #   is required by ansible
        # File
        # "/usr/lib/python2.7/site-packages/ansible/plugins/callback/foreman.py",
        #   line 30, in <module>

        STOPWORDS = [u'ansible', u'core', u'plugin']
        STOPCHARS = [u'"', u"'", u'(', u')', u'?', u'*', u'`', u',']
        matches = []

        if u'Traceback (most recent call last)' in body:
            lines = body.split(u'\n')
            for line in lines:
                line = line.strip()
                if line.startswith(u'DistributionNotFound'):
                    matches = [u'setup.py']
                    break
                elif line.startswith(u'File'):
                    fn = line.split()[1]
                    for SC in STOPCHARS:
                        fn = fn.replace(SC, u'')
                    if u'ansible_module_' in fn:
                        fn = os.path.basename(fn)
                        fn = fn.replace(u'ansible_module_', u'')
                        matches = [fn]
                    elif u'cli/playbook.py' in fn:
                        # NOTE(review): fn is rewritten here and in the next
                        # branch but never added to matches.
                        fn = u'lib/ansible/cli/playbook.py'
                    elif u'module_utils' in fn:
                        idx = fn.find(u'module_utils/')
                        fn = u'lib/ansible/' + fn[idx:]
                    elif u'ansible/' in fn:
                        idx = fn.find(u'ansible/')
                        fn1 = fn[idx:]

                        if u'bin/' in fn1:
                            if not fn1.startswith(u'bin'):

                                idx = fn1.find(u'bin/')
                                fn1 = fn1[idx:]

                                if fn1.endswith(u'.py'):
                                    # Drop the suffix itself; rstrip(u'.py')
                                    # would strip any trailing '.', 'p' or
                                    # 'y' characters (e.g. 'happy.py' -> 'ha').
                                    fn1 = fn1[:-len(u'.py')]

                        elif u'cli/' in fn1:
                            idx = fn1.find(u'cli/')
                            fn1 = fn1[idx:]
                            fn1 = u'lib/ansible/' + fn1

                        elif u'lib' not in fn1:
                            fn1 = u'lib/' + fn1

                        if fn1 not in self.files:
                            # Developer aid: drops into the epdb debugger when
                            # breakpoints are enabled in the config.
                            if C.DEFAULT_BREAKPOINTS:
                                logging.error(u'breakpoint!')
                                import epdb
                                epdb.st()
            if matches:
                return matches

        craws = template_data.get(u'component_raw')
        if craws is None:
            return matches

        # compare to component mapping
        matches = self._string_to_cmap_key(craws)
        if matches:
            return matches

        # do not re-process the same strings over and over again
        if craws.lower() in self.match_cache:
            return self.match_cache[craws.lower()]

        # make ngrams from largest to smallest and recheck
        blob = TextBlob(craws.lower())
        wordcount = len(blob.tokens) + 1

        for ng_size in reversed(xrange(2, wordcount)):
            ngrams = [u' '.join(x) for x in blob.ngrams(ng_size)]
            for ng in ngrams:

                matches = self._string_to_cmap_key(ng)
                if matches:
                    self.match_cache[craws.lower()] = matches
                    return matches

        # https://pypi.python.org/pypi/fuzzywuzzy
        matches = []
        for cr in craws.lower().split(u'\n'):
            ratios = []
            for k in self.CMAP.keys():
                ratio = fw_fuzz.ratio(cr, k)
                ratios.append((ratio, k))
            # Stable ascending sort: the best-scoring key ends up last.
            ratios = sorted(ratios, key=lambda tup: tup[0])
            # "ratios" is empty when CMAP has no keys; guard the [-1] lookup.
            if ratios and ratios[-1][0] >= 90:
                cnames = self.CMAP[ratios[-1][1]]
                matches += cnames
        if matches:
            self.match_cache[craws.lower()] = matches
            return matches

        # try to match to repo files
        if craws:
            clines = craws.split(u'\n')
            for craw in clines:
                cparts = craw.replace(u'-', u' ')
                cparts = cparts.split()

                # Clean each word and turn it into a path suffix ("/word").
                for idx, x in enumerate(cparts):
                    for SC in STOPCHARS:
                        if SC in x:
                            x = x.replace(SC, u'')
                    for SW in STOPWORDS:
                        if x == SW:
                            x = u''
                    if x and u'/' not in x:
                        x = u'/' + x
                    cparts[idx] = x

                cparts = [x.strip() for x in cparts if x.strip()]

                for x in cparts:
                    for f in self.files:
                        # Skip files that the issue text gives no reason to
                        # consider.
                        if u'/modules/' in f:
                            continue
                        if u'test/' in f and u'test' not in craw:
                            continue
                        if u'galaxy' in f and u'galaxy' not in body:
                            continue
                        if u'dynamic inv' in body.lower(
                        ) and u'contrib' not in f:
                            continue
                        if u'inventory' in f and u'inventory' not in body.lower(
                        ):
                            continue
                        if u'contrib' in f and u'inventory' not in body.lower(
                        ):
                            continue

                        # Probe first: skip the file if comparing it with x
                        # raises UnicodeDecodeError.
                        try:
                            f.endswith(x)
                        except UnicodeDecodeError:
                            continue

                        fname = os.path.basename(f).split(u'.')[0]

                        # The first file matching this word wins; the file's
                        # base name must also appear in the issue body.
                        if f.endswith(x):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break
                        if f.endswith(x + u'.py'):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break
                        if f.endswith(x + u'.ps1'):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break
                        if os.path.dirname(f).endswith(x):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break

        logging.info(u'%s --> %s' % (craws, sorted(set(matches))))
        self.match_cache[craws.lower()] = matches
        return matches
Ejemplo n.º 37
0
		for link in soup.findAll('a'):
			href = link['href']
			if not href in links_detected:
				if href.startswith('http'):
					# Filter
					if url.split('/')[2] in href:
						links_detected.append(href)
					# If requested data found in url
					elif query.lower() in href.lower():
						print(Fore.GREEN + '--- Requested data found at link : ' + href)
						links_detected.append(href)
						if saveInFile:
							with open(query + ".txt", "a") as file:
								file.write(href + "\n")
					# If text in link and link location is similar
					elif fuzz.ratio(link.text, href) >= 60:
						print(Fore.GREEN + '--- Text and link are similar : ' + href)
						links_detected.append(href)
						if saveInFile:
							with open(query + ".txt", "a") as file:
								file.write(href + "\n")
	except:
		continue
	if links_detected == []:
		print(Fore.RED + '--- No data found')



	
#for s in links_detected: print(s)
Ejemplo n.º 38
0
def _snap_to_choice(value, choices, scorer, threshold):
    """Return *value* replaced by every choice that scores at least *threshold*.

    Choices are tried in order and a later match overrides an earlier one;
    *value* is returned unchanged when nothing matches.
    """
    for choice in choices:
        if scorer(value, choice) >= threshold:
            value = choice
    return value


def _print_toggle(label, enabled):
    """Print *label* followed by *enabled*, green when truthy and red otherwise."""
    print(label + colored(enabled, "green" if enabled else "red"))


def _build_parser():
    """Create the KeyCrypt argument parser with one sub-parser per command."""
    parser = argparse.ArgumentParser(
        prog="KeyCrypt", description="Secure Password Manager With GPG Encryption", epilog="KeyCrypt Copyright (C) 2018 Akshay R. Kapadia")
    subparsers = parser.add_subparsers(dest="command")  # Primary command
    add_subparser = subparsers.add_parser("add")
    delete_subparser = subparsers.add_parser("delete")
    edit_subparser = subparsers.add_parser("edit")
    find_subparser = subparsers.add_parser("find")
    login_subparser = subparsers.add_parser("login")
    see_subparser = subparsers.add_parser("see")
    backup_subparser = subparsers.add_parser("backup")
    restore_subparser = subparsers.add_parser("restore")
    # These two commands take no arguments of their own.
    subparsers.add_parser("settings")
    subparsers.add_parser("nuke")

    # Add account parser
    add_subparser.add_argument("name", help="Name of the account", type=str)
    add_subparser.add_argument("-r", "--random-password",
                               help="Generates a random ASCII password of the specified length", type=int)

    # Delete account parser
    delete_subparser.add_argument("name", help="Name of the account", type=str)

    # Edit account parser
    edit_subparser.add_argument("name", help="Name of the account", type=str)
    edit_subparser.add_argument("-pv", "--password-visible",
                                help="Makes the password visible with the account data is shown", action="store_true")

    # Find account parser
    find_subparser.add_argument("name", help="Name of the account", type=str)
    find_subparser.add_argument("-pv", "--password-visible",
                                help="Makes the password visible with the account data is shown", action="store_true")

    # Autologin parser
    login_subparser.add_argument("name", help="Name of the account", type=str)

    # See category parser
    see_subparser.add_argument(
        "category", help="The category that you want to see", type=str)
    see_subparser.add_argument("-pv", "--password-visible",
                               help="Makes the password visible with the account data is shown", action="store_true")

    # Backup Parser
    backup_subparser.add_argument(
        "-d", "--delete", help="Deletes the original copy of the KeyCrypt data", action="store_true")
    backup_subparser.add_argument(
        "path", help="The path to the destination directory (Enter '?' to open the directory chooser", type=str)

    # Restore parser
    restore_subparser.add_argument(
        "-d", "--delete", help="Deletes the backed up copy of the KeyCrypt data", action="store_true")
    restore_subparser.add_argument(
        "-m", "--merge", help="Merges the accounts in the backup file with your current KeyCrypt", action="store_true")
    restore_subparser.add_argument(
        "path", help="The path to the directory where the backup is located (Enter '?' to open the directory chooser)", type=str)

    return parser


def main():
    """Run the KeyCrypt command-line interface.

    Parses the sub-command (add, delete, edit, find, login, see, backup,
    restore, settings or nuke; no sub-command lists every account), runs it
    against a ``KeyCrypt`` instance and reports the outcome on stdout.

    Every command other than ``nuke`` and ``backup`` ends by shredding
    ``geckodriver.log`` (if present) and saving the KeyCrypt, even when the
    command failed.  ``FileNotFoundError`` and ``ValueError`` raised anywhere
    are reported as a short message instead of a traceback.
    """
    categories = ["Email", "Web", "Social", "Banking", "Computer", "Other"]
    attributes = ["Name", "Username", "Password", "Url", "Category", "Autologin"]
    wifi_label = "Wifi Permission (Security Status & Autologin): "
    visible_label = "Passwords Visible: "

    args = vars(_build_parser().parse_args())

    keycrypt = KeyCrypt()
    try:
        if args["command"] == "nuke":
            confirmation = str(input(colored(
                "Are You Sure You Want To Permanently Nuke The KeyCrypt (y/N): ", "red"))).lower() in ["y", "yes"]
            if confirmation:
                # Second factor: the user must retype a generated key.
                confirmation_key = KeyCrypt.generate_password(
                    30, regenerate=False)
                typed_confirmation_key = str(
                    input(colored("Type ", "red") + colored(str(confirmation_key), "yellow") + colored(" To Nuke The KeyCrypt: ", "red")))
                if typed_confirmation_key == confirmation_key:
                    call(["shred", "-u", ".KeyCryptData.txt"])
                    print(colored("KeyCrypt Successfully Nuked", "green"))
                else:
                    print(colored("KeyCrypt Nuke Cancelled", "red"))
            else:
                print(colored("KeyCrypt Nuke Cancelled", "red"))
        else:
            keycrypt.gpg_name = str(input("Name Associated With GPG Key: ")
                                    ) if keycrypt.gpg_name is None else keycrypt.gpg_name
            if args["command"] == "backup":
                # The TclError raised for an empty path used to escape
                # uncaught because this branch sits outside the try block
                # that handles it; handle it here with the same message.
                try:
                    path = args["path"]
                    if path == "?":
                        tk.Tk().withdraw()
                        path = askdirectory()
                    if path == "":
                        raise tk.TclError
                    keycrypt.backup(path)
                    if args["delete"]:
                        call(["shred", "-u", ".KeyCryptData.txt"])
                    else:
                        keycrypt.save()
                    print(colored("KeyCrypt Successfully Backed Up", "green"))
                except tk.TclError:
                    print(colored("Invalid Directory", "red"))
            else:
                try:
                    if args["command"] is None:
                        banner()
                        KeyCrypt.update(keycrypt)
                        for account in keycrypt.accounts:
                            account.show_account(keycrypt.wifi_permission)
                    elif args["command"] == "restore":
                        path = args["path"]
                        if path == "?":
                            tk.Tk().withdraw()
                            path = askdirectory()
                        if path == "":
                            raise tk.TclError
                        if args["merge"]:
                            # Load the backup, then re-add every current
                            # account the backup does not already contain.
                            old_accounts = keycrypt.accounts
                            keycrypt = KeyCrypt(path)
                            for account_x in old_accounts:
                                duplicate = any(
                                    account_x.equals(account_y)
                                    for account_y in keycrypt.accounts)
                                if not duplicate:
                                    keycrypt.add_account(account_x)
                        else:
                            keycrypt = KeyCrypt(path)
                        if args["delete"]:
                            call(["shred", "-u", path +
                                  "/KeyCryptDataBackup.txt.gpg"])
                        print(colored("KeyCrypt Successfully Restored", "green"))
                    elif args["command"] == "settings":
                        print(colored("Settings", "red"))
                        _print_toggle(wifi_label, keycrypt.wifi_permission)
                        _print_toggle(visible_label, keycrypt.passwords_visible)
                        setting = str(input("Setting: ")).lower().capitalize()
                        setting = _snap_to_choice(
                            setting, ["Wifi Permission", "Passwords Visible"],
                            fuzz.partial_ratio, 50)
                        if setting not in ["GPG Name", "Wifi Permission", "Passwords Visible"]:
                            raise InvalidSettingError
                        if setting == "Wifi Permission":
                            keycrypt.wifi_permission = not keycrypt.wifi_permission
                            _print_toggle(wifi_label, keycrypt.wifi_permission)
                        else:
                            # NOTE(review): "GPG Name" passes the check above
                            # and would also land here, toggling password
                            # visibility - confirm whether that is intended.
                            keycrypt.passwords_visible = not keycrypt.passwords_visible
                            _print_toggle(visible_label, keycrypt.passwords_visible)
                    elif args["command"] == "add":
                        # NOTE(review): the original line was corrupted by
                        # redaction ("******"); the three statements below are
                        # a reconstruction - confirm the generate_password
                        # call against the KeyCrypt class.
                        username = str(input("Username: "))
                        password = (KeyCrypt.generate_password(args["random_password"])
                                    if args["random_password"] is not None
                                    else getpass.getpass("Password: "))
                        category = str(input(
                            "Category (Email, Web, Social, Banking, Computer, Other): ")).lower().capitalize()
                        category = _snap_to_choice(
                            category, categories, fuzz.ratio, 70)
                        if category not in categories:
                            raise InvalidCategoryError
                        url = str(
                            input("Url (Use Login Page For Autologin)(Start With 'https://'): "))
                        account = Account(args["name"], username, password, url,
                                          category, keycrypt)
                        keycrypt.add_account(account)
                        account.show_account(False)
                        print(
                            colored(args["name"] + " Account Successfully Created", "green"))
                        # Lower-cased like the other confirmations so that
                        # "N" / "No" decline as well.
                        autologin = str(
                            input("Configure Autologin (Y/n): ")).lower() not in ["n", "no"]
                        if autologin:
                            if KeyCrypt.wifi_enabled(keycrypt.wifi_permission):
                                account.configure_autologin()
                            else:
                                account.autologin = False
                                raise NoInternetError
                        account.autologin = (account.username_id is not None
                                             and account.password_id is not None)
                    elif args["command"] == "see":
                        if keycrypt.passwords_visible:
                            args["password_visible"] = True
                        viewable = categories + ["All"]
                        args["category"] = _snap_to_choice(
                            args["category"].lower().capitalize(),
                            viewable, fuzz.ratio, 70)
                        if args["category"] not in viewable:
                            raise InvalidCategoryError
                        if args["category"] == "All":
                            KeyCrypt.update(keycrypt)
                            for account in keycrypt.accounts:
                                account.show_account(keycrypt.wifi_permission,
                                                     args["password_visible"])
                        else:
                            for account in keycrypt.accounts:
                                if account.category == args["category"]:
                                    account.update_security_status(keycrypt)
                                    account.show_account(keycrypt.wifi_permission,
                                                         args["password_visible"])
                    else:
                        # delete / edit / login / find all address one account.
                        account = keycrypt.find_account(args["name"])
                        if args["command"] == "delete":
                            account.show_account(False, False)
                            confirmation = str(input(colored("Are You Sure You Want To Permanently Delete Your " +
                                                             account.name + " Account (y/N): ", "red"))).lower() in ["y", "yes"]
                            if confirmation:
                                confirmation_key = KeyCrypt.generate_password(
                                    15, False)
                                typed_confirmation_key = str(
                                    input(colored("Type ", "red") + colored(str(confirmation_key), "yellow") + colored(" To Delete Your " + account.name + " Account: ", "red")))
                                if typed_confirmation_key == confirmation_key:
                                    keycrypt.delete_account(account)
                                    print(colored(account.name +
                                                  " Account Deleted", "green"))
                                else:
                                    print(
                                        colored("Account Deletion Cancelled", "red"))
                            else:
                                print(colored("Account Deletion Cancelled", "red"))
                        elif args["command"] == "edit":
                            if keycrypt.passwords_visible:
                                args["password_visible"] = True
                            account.update_security_status(keycrypt)
                            account.show_account(
                                keycrypt.wifi_permission, args["password_visible"])
                            attribute = str(input("Attribute: ")
                                            ).lower().capitalize()
                            attribute = _snap_to_choice(
                                attribute, attributes, fuzz.ratio, 70)
                            if attribute not in attributes:
                                raise InvalidAttributeError
                            account.edit_account(attribute, keycrypt)
                            account.update_security_status(keycrypt)
                            account.show_account(
                                keycrypt.wifi_permission, args["password_visible"])
                            print(
                                colored(attribute + " Successfully Edited", "green"))
                        elif args["command"] == "login":
                            if KeyCrypt.wifi_enabled(keycrypt.wifi_permission):
                                if (account.autologin and account.username_id is not None and account.password_id is not None):
                                    print(colored("Logging Into Your " +
                                                  account.name + " Account...", "red"))
                                    account.login()
                                    print(
                                        colored("Successfully Entered Login Information", "green"))
                                else:
                                    raise AccountNotConfiguredError(
                                        "Account Is Not Configured For Autologin", account)
                            else:
                                raise NoInternetError
                        elif args["command"] == "find":
                            if keycrypt.passwords_visible:
                                args["password_visible"] = True
                            for account in keycrypt.find_account(args["name"], True):
                                account.update_security_status(keycrypt)
                                account.show_account(keycrypt.wifi_permission,
                                                     args["password_visible"])
                except InvalidCategoryError:
                    print(colored("Invalid Category", "red"))
                    print(colored(
                        "Categories: Web, Social, Computer, Banking, Email, Other (, All)", "red"))
                except InvalidAttributeError:
                    print(colored("Invalid Account Attribute", "red"))
                    print(
                        colored("Attributes: Name, Username, Password, Url, Category", "red"))
                except InvalidSettingError:
                    print(colored("Invalid Setting", "red"))
                    print(
                        colored("Settings: GPG Name, Wifi Permission, Passwords Visible", "red"))
                except tk.TclError:
                    print(colored("Invalid Directory", "red"))
                except NoInternetError:
                    print(colored("No Internet Connection", "red"))
                except WebDriverException as error:
                    # Inspect the raised instance; str() of the exception
                    # class itself never contains the driver's message.
                    if "gecko" in str(error):
                        print(colored("'geckodriver' Not Installed", "red"))
                    else:
                        print(colored("Incorrect Account Information", "red"))
                finally:
                    if isfile("geckodriver.log"):
                        call(["shred", "-u", "geckodriver.log"])
                    keycrypt.save()
    except FileNotFoundError:
        print(colored("File Not Found", "red"))
    except ValueError:
        print(colored("Invalid Input, Try Again", "red"))
Ejemplo n.º 39
0
    conn.row_factory = lambda cursor, row: row[0]
    c = conn.cursor()
    conn.row_factory = sqlite3.Row
    cur = conn.cursor()
    cur.execute("SELECT * FROM pokemons;")
    schema = cur.fetchone()

    names = c.execute("SELECT Nom FROM pokemons;").fetchall()
    request = ""

    if choice == "A":
        npt = input("\nEntrer le nom du pokémon\n> ")
        npt = unidecode(npt.title().strip())
        if npt not in names:
            for n in names:
                if fuzz.ratio(npt, n) > 80:
                    request = f"SELECT * FROM pokemons WHERE Nom = '{n}';"
                    break
        else:
            request = f"SELECT * FROM pokemons WHERE Nom = '{npt}';"

    else:
        npt = input("\nEntrer l'ID du pokémon\n> ")
        while len(npt) != 3:
            npt = "0" + npt
        request = f"SELECT * FROM pokemons WHERE ID = '{npt}';"
    print("\n")
    try:
        data = crsr.execute(request).fetchall()
        if data:
            for field, d in zip(schema.keys(), data[0]):
from nltk import ngrams
from fuzzywuzzy import fuzz

##### Accumulators: every n-gram seen and its similarity score #####

stringlist = []
scorelist = []

##### Review text to scan and the phrase to look for #####

text = (
    "DON'T BUY THIS LAPTOP. THEY WILL SEND YOU A DEFECTIVE PIECE. I'M REGRETTING BIG TIME. I purchased this on 1st July 2020 and"
    " within 15 days the keyboard just stopped working Engineer suspected hardware issue. I demanded replacement as for a fact they sent a defective"
    " piece but amazon and lenovo both denied. Lenovo should be banned from our country. I regret for not saving enough money and buying another"
    " brand. I regret I got carried away with the new launch. This laptop doesn't even deserve one star. Amazon has also lost my trust when"
    " it comes to buying electronic items online. And lenovo can't even comment on their cheap quality product. DO NOT BUY.. "
)

str1 = 'keyboard stopped working'

##### Split the lower-cased review into word n-grams #####

n = 4
tokens = text.lower().split()
phrase = ngrams(tokens, n)

##### Join each n-gram back into a string and score it against the phrase #####

needle = str1.lower()
for grams in phrase:
    str2 = ' '.join(grams)
    stringlist.append(str2)
    x = fuzz.ratio(str2, needle)
    scorelist.append(x)
Ejemplo n.º 41
0
def ratio(a, b):
        """Return a case-insensitive fuzzy similarity score for *a* and *b*.

        ``fuzz.partial_ratio`` is used when ``contains_multiple_words(a)`` is
        truthy, ``fuzz.ratio`` otherwise; both strings are lower-cased
        before being scored.
        """
        scorer = fuzz.partial_ratio if contains_multiple_words(a) else fuzz.ratio
        return scorer(a.lower(), b.lower())
    similarities = []

    with open(args.outputname, 'w') as f:

        i = 0
        # for sentence in tqdm(sample_sentences):
        for sentence in sample_sentences:
            print(i, "/", len(sample_sentences))

            closest_sentences[i] = []

            max_so_far = -1.0
            closest_sentence = ""

            for close_sentence in full_sentence_list:
                r = float(fuzz.ratio(sentence, close_sentence)) / 100.0
                # SUPER SLOW >> r = (difflib.SequenceMatcher(None, sentence, close_sentence).ratio())
                if r >= max_so_far:
                    max_so_far = r
                    closest_sentence = close_sentence

                    closest_sentences[i].append([r, closest_sentence])
            similarities.append(max_so_far)

            ###
            """
            if max_so_far > 0.8:
                print(i)
                print("Generated sentence ",i," \"", sentence, "\" has closest:")
                print(max_so_far,":",closest_sentence)
                print("-----------------")
Ejemplo n.º 43
0
	nickname = Jackaroo 

	Jack -- a, r, o, o <- edit distance = 4 

	J,a,c,k
	J,a,c,k,a,r,o,o <- total characters = 12 
			<- 8 of 12 match 
			<- fuzzy string matching = 66.67% 
		   '''
    print('The Calculation is: {}'.format(calculation))

    print('Edit Distance is:')
    print(edit_distance(name, nickname))
    print('\n')
    print('Fuzzy Matching Percentage is: \n')
    print('{}'.format(fuzz.ratio(name, nickname)), '%')
    print('\n')

    sents = '''It was the best of times,
it was the worst of times,
it was the age of wisdom,
it was the age of foolishness,
it was the epoch of belief,
it was the epoch of incredulity,
it was the season of Light,
it was the season of Darkness,
it was the spring of hope,
it was the winter of despair,
we had everything before us,
we had nothing before us,
we were all going direct to Heaven,
Ejemplo n.º 44
0
    firstLineList = []

    ##file location where you want to place the concatenated txt files
    with open("uf" + "/" + book + ".txt", 'w') as outfile:
        for txtfile in textfiles:
            print(txtfile)
            fullName = path + "/" + book + "/" + txtfile
            headerStatus = False

            with codecs.open(fullName, "r", encoding='utf-8',
                             errors='ignore') as infile:
                lineNum = 0
                for line in infile:
                    if (lineNum == 0):
                        for firstLine in firstLineList:
                            if (fuzz.ratio(firstLine, line) > 60
                                    and len(line) > 5):
                                headerStatus = True
                                print(line)
                                break

                        firstLineList.append(line)
                    try:
                        if (lineNum == 0):
                            if (headerStatus == False):
                                outfile.write(line)
                        else:
                            outfile.write(line)
                    except UnicodeEncodeError:
                        if (lineNum == 0):
                            if (headerStatus == False):
Ejemplo n.º 45
0
def fitness(agents):
    """Score every agent in place and return the same collection.

    Each agent's ``fitness`` attribute is set to ``fuzz.ratio`` of its
    ``string`` attribute against the module-level ``in_str``.
    """
    for candidate in agents:
        candidate.fitness = fuzz.ratio(candidate.string, in_str)
    return agents
Ejemplo n.º 46
0
 matchScore = matchScore.append(pd.DataFrame({
          'partyid1':row['PARTY_ID_1'],
          'partyid2':row['PARTY_ID_2'],
          #'full_name1':row['FULL_NAME_1'],
          #'full_name2':row['FULL_NAME_2'],
          #'nameScore':fuzz.token_set_ratio(str(row['FULL_NAME_1']).upper(),str(row['FULL_NAME_2']).upper())/100 ,
          'first_name1':row['FIRSTNAME_1'],
          'first_name2':row['FIRSTNAME_2'],
          'firstnameScore':fuzz.token_set_ratio(str(row['FIRSTNAME_1']).upper(),str(row['FIRSTNAME_2']).upper())/100 ,
         #'nameScore':sorted_levenshtein_rate(str(row['FULL_NAME_1']).upper(),str(row['FULL_NAME_2']).upper()) ,
          'last_name1':row['LASTNAME_1'],
          'last_name2':row['LASTNAME_2'],
          'lastnameScore':fuzz.token_set_ratio(str(row['LASTNAME_1']).upper(),str(row['LASTNAME_2']).upper())/100 ,
          'mobile1':row['MOBILE_1'],
          'mobile2':row['MOBILE_2'],
          'mobileScore': fuzz.ratio(str(row['MOBILE_1']),str(row['MOBILE_2']))/100, 
          'private1':row['PRIVATE_1'],
          'private2':row['PRIVATE_2'],
          'privateScore': fuzz.ratio(str(row['PRIVATE_1']),str(row['PRIVATE_2']))/100, 
          'work1':row['WORK_1'],
          'work2':row['WORK_2'],
          'workScore': fuzz.ratio(str(row['WORK_1']),str(row['WORK_2']))/100, 
          'email1':row['ELECTRONIC_ADDRESS_1'],
          'email2':row['ELECTRONIC_ADDRESS_2'],
          'emailScore': fuzz.partial_ratio(str(row['ELECTRONIC_ADDRESS_1']).upper(),str(row['ELECTRONIC_ADDRESS_2']).upper())/100, 
          'addressLine1':row['ST_ADDRESS_LINE_1'],
          'addressLine2':row['ST_ADDRESS_LINE_2'],
          
          'addressLineScore': fuzz.token_sort_ratio(str(row['ST_ADDRESS_LINE_1']).upper(),str(row['ST_ADDRESS_LINE_2']).upper())/100,
          'city1':row['ST_CITY_1'],
          'city2':row['ST_CITY_2'],
Ejemplo n.º 47
0
def process_venue(i, l, db, curId):
    """Match a venue record to a row in ``city.venues``, inserting it if new.

    Lookup order: exact match on the formatted address, then a fuzzy match on
    the venue name.  When nothing matches, names containing 'TBA' or
    'Vary By' are given id 2 (presumably a placeholder row - confirm);
    any other venue is geocoded and inserted with id ``curId + 1``.

    Args:
        i: Index of the venue in the current run; progress is logged when
            ``i % 50 == 0``.
        l: Venue dict. Reads "id" and "name", and optionally "address",
            "location" and "postalCode".
        db: Database handle providing ``query(sql, *params).getresult()``
            and ``insert(table, row)``.
        curId: Highest venue id handed out so far.

    Returns:
        Tuple ``(venue, inserted_venue)``: the venue dict built here
        (including its "id") and 1 if a row was inserted, else 0.
    """
    inserted_venue = 0
    # Get Venue Information
    venue = {
        "tm_venue_id": l["id"],
        "venue_name": l["name"].replace("\'", ""),
    }

    if i % 50 == 0:
        logger.info('Processing venue #%s', i + 1)
        logger.info('Venue: %s, id: %s', venue["venue_name"],
                    venue["tm_venue_id"])

    exist = []
    venue["venue_address"] = None
    address = l.get("address")
    if address:
        # Take the last entry's value without popping it, so the caller's
        # dict is left intact.
        venue["venue_address"] = format_address(list(address.values())[-1])
        if venue["venue_address"] is not None:
            exist = db.query(
                "SELECT * FROM city.venues where venue_add_comp = $1",
                venue["venue_address"]).getresult()
    venue["venue_add_comp"] = venue["venue_address"]

    if not exist and venue["venue_name"] is not None:
        names = db.query("SELECT venue_name FROM city.venues").getresult()
        for name in names:
            n = name[0].replace("\'", "")
            score = fuzz.ratio(n, venue["venue_name"])
            # Accept a strong overall match, or a moderate one backed by a
            # very strong partial match.  A later match overrides an
            # earlier one.
            if (score > 80
                    or (score > 60
                        and fuzz.partial_ratio(n, venue["venue_name"]) > 90)):
                exist = db.query(
                    "SELECT * FROM city.venues where venue_name = $1",
                    n).getresult()

    if not exist:
        venue_name = venue['venue_name']
        # Substring test instead of ``find(...) > 0``, which missed names
        # that *start* with the marker (find() returns 0 there).
        if 'TBA' in venue_name or 'Vary By' in venue_name:
            venue["id"] = 2
        else:
            logger.info('INSERT VENUE: %s', venue_name)
            logger.debug('Address: %s', venue["venue_address"])
            curId = curId + 1
            venue["id"] = curId
            if "location" in l and "postalCode" in l:
                logger.debug('Postal Code: %s', l["postalCode"])
                lat = l["location"].get("latitude", 0)
                lon = l["location"].get("longitude", 0)
                logger.debug('Coords: (%s, %s)', lat, lon)
                if venue["venue_address"] is not None and lat != 0 and lon != 0:
                    add = venue["venue_address"] + ", Toronto, ON "
                    add += l["postalCode"] + ", Canada"
                elif int(lat) != 0 and int(lon) != 0:
                    coord = str(lat) + ',' + str(lon)
                    # Default first: if reverse geocoding fails, ``add``
                    # would otherwise be unbound further down.
                    add = None
                    try:
                        (venue["venue_add_comp"], add) = rev_geocode(coord)
                    except AddressParserException as ape:
                        logger.error(ape)
                elif venue["venue_address"] is not None:
                    (add, lat, lon) = geocode(venue["venue_address"])
                else:
                    add = None
            else:
                (add, lat, lon) = geocode(venue["venue_address"])
            venue["venue_address"] = add
            venue["lat"] = lat
            venue["lon"] = lon
            venue["capacity"] = None
            db.insert('city.venues', venue)
            inserted_venue += 1
    else:
        # Use the first int column of the matched row as the id; if the row
        # has no int column the last column value is kept.
        for value in exist[0]:
            venue["id"] = value
            if type(value) is int:
                break
    return venue, inserted_venue
Ejemplo n.º 48
0
async def _(bot: Bot, event: Event):
    """Reply with the cheapest warframe.market sell orders for an item.

    The message text is ``<item name>[,<minimum mod rank>]``.  The item name
    is fuzzy-matched against the ``zh`` names in ``WF_Sale``; the orders of
    the best match are fetched, filtered to visible sell orders from
    in-game sellers (at or above the requested mod rank when the order has
    one), and the ten cheapest are sent as a group forward message: one
    header node, one summary node and one detail node per order.
    Timestamps are shown shifted by +8 hours from the API's UTC values.

    Any exception is printed and answered with a generic failure message.
    """
    divider = "——————————————————————————————"

    def make_node(content):
        # Build one forward-message node carrying *content*.
        return {
            "type": "node",
            "data": {
                "uin": f"{bot.self_id}",
                "name": "ZANUKA",
                "content": content
            }
        }

    def to_local(stamp):
        # Parse an API UTC timestamp and render it at UTC+8.
        utc_time = datetime.strptime(stamp, "%Y-%m-%dT%H:%M:%S.%f+00:00")
        return (utc_time +
                timedelta(hours=8)).strftime("%Y 年 %m 月 %d 日 %H:%M:%S")

    try:
        item = event.get_message().__str__().strip()
        if item == "":
            await wm.send("参数不能为空!")
            return

        # Split off the optional mod rank.  The full-width comma is accepted
        # as a separator too; the previous replace(",", ",") was a no-op.
        # NOTE(review): assumed the original literal was the full-width comma.
        item = item.replace("，", ",")
        mod_rank = 0
        if "," in item:
            parts = item.split(",")
            rank_text = parts[1].strip()
            if not rank_text.isdigit():
                await wm.send("参数错误")
                # Stop here; falling through made int() raise and produced
                # a second, misleading failure message.
                return
            mod_rank = int(rank_text)
            item = parts[0].strip()

        # Similarity matching: compare with spaces only when the query has them.
        keep_spaces = " " in item
        index = 0
        similar = 0
        for position, entry in enumerate(WF_Sale):
            candidate = entry["zh"] if keep_spaces else entry["zh"].replace(
                " ", "")
            score = fuzz.ratio(item, candidate)
            if score > similar:
                similar = score
                index = position
        if similar < 50:
            await wm.send("未找到该物品,请缩小范围!")
            return
        matched = WF_Sale[index]

        await wm.send("正在查询 [{0}] 价格,请稍等".format(matched["zh"]))
        # A timeout keeps a stalled request from hanging the handler forever.
        response = requests.get(
            url="https://api.warframe.market/v1/items/{0}/orders".format(
                matched["code"]),
            timeout=10)
        data = json.loads(response.text)

        # Filter and sort.
        eligible = [
            order for order in data["payload"]["orders"]
            if order["visible"] and order["user"]["status"] == "ingame"
            and order["order_type"] == "sell"
            and not ("mod_rank" in order and order["mod_rank"] < mod_rank)
        ]
        # Stable sort over the reversed list: among equal prices the later
        # order comes first, matching the previous hand-written insertion.
        rank = sorted(reversed(eligible),
                      key=lambda order: order["platinum"])[:10]

        nodes = [
            make_node(
                "查价物品: {0} ({1})\n数据来源: https://warframe.market/items/{2}".format(
                    matched["zh"], matched["en"], matched["code"]))
        ]

        # Summary: one line per order.  The rank is checked per order so an
        # order without "mod_rank" can no longer raise KeyError.
        lines = [divider]
        for order in rank:
            if "mod_rank" in order:
                lines.append(
                    "—单价: {0} —数量: {1} —等级: {2} —卖家: {3} —声誉: {4}".format(
                        int(order["platinum"]), order["quantity"],
                        order["mod_rank"], order["user"]["ingame_name"],
                        order["user"]["reputation"]))
            else:
                lines.append("—单价: {0} —数量: {1} —卖家: {2} —声誉: {3}".format(
                    int(order["platinum"]), order["quantity"],
                    order["user"]["ingame_name"],
                    order["user"]["reputation"]))
        lines.append(divider)
        nodes.append(make_node("\n".join(lines)))

        # Details: one node per order, ending with a ready-to-paste whisper.
        for order in rank:
            has_rank = "mod_rank" in order
            seller = order["user"]
            lines = [divider]
            if has_rank:
                lines.append("—单价: {0} —数量: {1} —等级: {2}".format(
                    int(order["platinum"]), order["quantity"],
                    order["mod_rank"]))
            else:
                lines.append("—单价: {0} —数量: {1}".format(
                    int(order["platinum"]), order["quantity"]))
            lines.append("—卖家: {0} —声誉: {1} —地区: {2}".format(
                seller["ingame_name"], seller["reputation"],
                seller["region"]))
            lines.append("—创建时间: " + to_local(order["creation_date"]))
            lines.append("—上次更新: " + to_local(order["last_update"]))
            lines.append("—上次来看: " + to_local(seller["last_seen"]))
            lines.append(divider)
            if has_rank:
                lines.append(
                    "/w {0} Hi! I want to buy: {1} (rank {2}) for {3} platinum. (warframe.market)".format(
                        seller["ingame_name"], matched["en"],
                        order["mod_rank"], order["platinum"]))
            else:
                lines.append(
                    "/w {0} Hi! I want to buy: {1} for {2} platinum. (warframe.market)".format(
                        seller["ingame_name"], matched["en"],
                        order["platinum"]))
            nodes.append(make_node("\n".join(lines)))

        await bot.send_group_forward_msg(group_id=event.group_id,
                                         messages=nodes)
    except Exception as e:
        print(e)
        await wm.finish("获取失败,请重试!")
Ejemplo n.º 49
0
import time

# Interactive typing test: show a phrase, time how long the user takes to
# retype it, and score the attempt with fuzz.ratio (0-100).
phrase = 'She sells seashells by the seashore'
ans = True
while ans:
    print(
        '\nInstructions: You will be shown a phrase, and you have to type it as fast and accurate as possible!'
    )
    print('\nAre you ready? (Press any key to continue)')
    # An empty line at this prompt ends the program (unchanged behaviour).
    ans = input('>> ')
    if not ans:
        break

    print(phrase)
    start = time.time()
    result = input('>> ')
    end = time.time()
    Ratio = fuzz.ratio(phrase, result)
    print('Your accuracy was ' + str(Ratio) + '%!')
    print('Your time was ' + "{:.2f}".format(end - start) + ' seconds!')

    # Keep asking until an explicit yes/no is given. Previously an empty
    # reply printed the "invalid" message and then silently started a new
    # round, because the loop flag doubled as the input value.
    while True:
        print('\nDo you want to try again? (Y/N)')
        ans2 = input('>> ').lower()
        if ans2 == 'n':
            ans = False
            print('Bye!')
            break
        if ans2 == 'y':
            ans = True
            break
        print('Invalid answer. Please try again.')
Ejemplo n.º 50
0
def fuzzy_match(word1, word2):
    """Return the ``fuzz.ratio`` score of the two words divided by 100."""
    percent = fuzz.ratio(word1, word2)
    return percent / 100
Ejemplo n.º 51
0
    def handle(self, *args, **options):
        rx = Record.objects.all()
        all = rx.count()
        cnt = 0
        print "Iterating over " + str(
            all) + " database records, starting at " + str(options['start'])
        for i, r1 in enumerate(rx):
            # Obey start position argument
            if i < options['start']: continue
            for j, r2 in enumerate(rx):
                if j <= i: continue

                ratio = fuzz.ratio(r1.name, r2.name)
                if ratio < 75:
                    continue
                if r1.person_id == r2.person_id:
                    continue
                if r1.country != r2.country:
                    continue
                if r1.gender != r2.gender:
                    continue
                # Print leftovers:
                print ""
                print u"Score: {0:3d}         {1:30}{2}".format(
                    ratio, r1.name, r2.name)
                print u"Person-ID:         {1:30}{2}".format(
                    ratio, r1.person_id, r2.person_id)
                print u"Follow-up:         {0!r:<30}{1}".format(
                    r1.follow_up_case, r2.follow_up_case)
                print u"Date intervention: {0:30}{1}".format(
                    str(r1.date_intervention), str(r2.date_intervention))
                print u"Issue area:        {0:30}{1}".format(
                    r1.issue_area, r2.issue_area)
                print u"Activities:        {0:30}{1}".format(
                    r1.relevant_activities, r2.relevant_activities)
                if Record.objects.filter(pk=r1.pk,
                                         follow_ups__pk=r2.pk).exists():
                    print u"Relation exists?       ************** YES ****************"
                else:
                    print u"Relation exists?       ..............  NO ................"
                while True:
                    data = str(
                        raw_input(
                            "(a)dd, (r)emove relation, (s)kip or (p)ause: "))
                    if data.lower() not in ('a', 'r', 's', 'p'):
                        print("Not an appropriate choice.")
                    else:
                        break
                if data == "a":
                    r1.follow_ups.add(r2)
                    r1.save()
                elif data == "r":
                    r1.follow_ups.remove(r2)
                    r1.save()
                elif data == "s":
                    continue
                elif data == "p":
                    print "Restart with argument: " + str(i)
                    self.stdout.write(self.style.SUCCESS('Paused at %i' % i))
                    return
                cnt += 1
                print "Status: {:2.1f}".format((100.0 * i) / all)

        self.stdout.write(
            self.style.SUCCESS('Successfully edited all fuzzy relations'))
Ejemplo n.º 52
0
def match_check(x, checklist, fuzzy_thresh=70):
    """Tell whether ``x`` scores above ``fuzzy_thresh`` against any candidate.

    Each entry of ``checklist`` is compared with ``x`` via ``fuzz.ratio``;
    the result is True as soon as one score is strictly greater than the
    threshold, and False for an empty checklist.
    """
    return any(fuzz.ratio(x, candidate) > fuzzy_thresh
               for candidate in checklist)
Ejemplo n.º 53
0
def process_text(str_cmp, str_exact):
    """Return the ``fuzz.ratio`` score of ``str_exact`` against ``str_cmp``."""
    score = fuzz.ratio(str_exact, str_cmp)
    return score
Ejemplo n.º 54
0
def matching1(tagged,
              shas,
              i,
              j,
              k,
              index,
              daf,
              amud,
              strings=15,
              ratio=False):
    """Locate the opening words of ``tagged`` among the lines of ``shas[index]``.

    A search string is built from the first words of ``tagged``, stripped of
    brackets, ``*``, ``#``, ``@`` and digits, and compared with every line of
    ``shas[index]`` using ``fuzz.partial_ratio`` (and, when ``ratio`` is
    True, additionally ``fuzz.ratio``).

    Outcomes:
      * no hit and ``strings`` is not 0: retry with one word fewer;
      * more than one hit: retry once with ``ratio=True``; if that still
        yields several hits, write a "found too much" line to the logs;
      * exactly one hit: append a link to the module-level ``links`` list,
        print it and write a "found" line to ``longlog``;
      * no hit and ``strings`` is 0: write a "did not find" line to the logs.

    Relies on the module-level names ``masechet``, ``links``, ``link``,
    ``log`` and ``longlog``. Always returns None.
    """
    # `short` and `fuzzed` are counters that are updated but never read.
    short = 0
    fuzzed = 0
    if len(tagged) >= 15:
        string = " ".join(tagged[0:strings])
    else:
        short += 1
        # NOTE(review): for short inputs the last word is dropped and
        # `strings` is ignored, so the retries below rebuild the same string.
        string = " ".join(tagged[0:len(tagged) - 1])
    #print string, daf, amud
    # Remove '[', ']', '*', '#', '@' and digits before comparing.
    string = re.sub(ur'[\[\]\*#@[0-9]', "", string)
    found = 0
    for counter, line in enumerate(shas[index], start=1):
        if fuzz.partial_ratio(string, line) > 80:
            # 1-based number of the most recent line passing the partial
            # match; set even when the stricter ratio check below fails.
            bingo = counter
            if ratio is True:
                if fuzz.ratio(string, line) > 60:
                    fuzzed += 1
                    found += 1
                    #print "ratio", string, daf, strings, fuzz.ratio(string, line)
            else:
                found += 1
    #if fuzzed > 0:
    #print "fuzzed", fuzzed
    if found < 1 and strings != 0:
        # Nothing matched: shorten the search string and try again. The
        # `ratio` flag is not forwarded, so the retry uses ratio=False.
        strings -= 1
        matching1(tagged, shas, i, j, k, index, daf, amud, strings)
        return
    elif found > 1:
        if ratio is True:
            # Still ambiguous under the stricter check: report and give up.
            error = "found too much, " + str(found) + "," + "on," + str(
                daf) + amud + "," + " ".join(
                    tagged[0:15]).encode('utf-8') + "\n"
            log.write(error)
            longlog.write(error)
        else:
            # Ambiguous: repeat the search with the stricter ratio check.
            matching1(tagged, shas, i, j, k, index, daf, amud, strings, True)
            return

    elif found == 1:
        # Unique hit: build both reference strings and record the link.
        roash = "Rosh on %s." % masechet + str(k + 2) + "." + str(
            i + 1) + "." + str(j + 1)
        talmud = "%s." % masechet + str(daf) + amud + "." + str(bingo)
        links.append(link(talmud, roash))
        print roash, talmud
        succes = "found" + ", " + string.encode('utf-8') + str(
            daf) + amud + "," + str(strings) + "\n"
        #   print succes
        longlog.write(succes)
    elif strings == 0:
        # Reached only with no hit and no words left to drop.
        error = "did not find on daf," + str(daf) + amud + "," + " ".join(
            tagged[0:15]).encode('utf-8') + "\n"
        log.write(error)
        longlog.write(error)
input_file_name = "t_sample_80.txt"
input_file_name2 = "smi_superset1.txt"
output_file_name = "sample_result_ver_2.txt"

# Load the transcript lines (whitespace-stripped) and the pickled subtitle
# entries. Each entry is indexed as (text, value, value) below.
with open(input_file_name) as t, open(input_file_name2, 'rb') as s:
    t_content = [y.strip() for y in t]
    # NOTE(review): pickle.load can execute arbitrary code; only feed this
    # script pickle files from a trusted source.
    s_content = pickle.load(s)

# Write every transcript/subtitle pair whose fuzz.ratio exceeds 80. The
# context manager guarantees the report is flushed and closed.
with open(output_file_name, 'w') as f_out:
    f_out.write("ver_2.0 \n ===============\n")
    for idx, transcript in enumerate(t_content):
        for entry in s_content:
            # Score once and reuse it for both the test and the report.
            score = fuzz.ratio(transcript, entry[0])
            if score > 80:
                f_out.write("%d\n" % idx)
                f_out.write("transcript : " + transcript + "\n")
                f_out.write("smi        :  " + entry[0] + "\n")
                f_out.write("%d %d" % (entry[1], entry[2]))
                f_out.write("\n")
                f_out.write("score : %d" % score + "\n")
                f_out.write("===================\n")
def fuzzymatching(target, threshold, window=10):
    """Group near-duplicate values and map each to its most frequent variant.

    The distinct values of ``target`` are sorted, and each value is compared
    (``fuzz.ratio``) with itself and the following values inside a sliding
    window of ``window`` entries. Values scoring strictly above
    ``threshold`` are grouped; a value already present in an earlier group
    has its neighbours merged into that group. Within each group the member
    with the highest count in ``target.value_counts()`` becomes the clean
    name.

    Args:
        target: a pandas Series (it must provide ``value_counts()``) of
            hashable, mutually sortable values.
        threshold: minimum ``fuzz.ratio`` score (exclusive) to group values.
        window: how many consecutive sorted values, including the value
            itself, each value is compared with. Defaults to 10.

    Returns:
        A DataFrame with the columns ``original_com`` (every grouped value)
        and ``clean_com`` (the representative chosen for its group).
    """
    names = sorted(set(target))

    # For every name, the names in its window that are similar enough.
    neighbours = [
        [other for other in names[pos:pos + window]
         if fuzz.ratio(name, other) > threshold]
        for pos, name in enumerate(names)
    ]

    # Merge a name's neighbours into every existing group that already
    # contains it; otherwise start a new group.
    groups = []
    for name, close in zip(names, neighbours):
        merged = False
        for row, members in enumerate(groups):
            if name in members:
                merged = True
                # Sorted so that the group order (and therefore the
                # tie-break below) is reproducible between runs.
                groups[row] = sorted(set(members) | set(close))
        if not merged:
            groups.append(close)

    # A real list: on Python 3 ``filter`` is a one-shot iterator, which
    # cannot be iterated twice or indexed as the code below needs.
    total = [members for members in groups if members]

    counts = target.value_counts().to_dict()
    mail_count = {k: v for k, v in counts.items() if k}

    pop_com = []
    for members in total:
        frequencies = [mail_count.get(member) for member in members]
        # First member with the highest frequency represents the group.
        representative = members[frequencies.index(max(frequencies))]
        pop_com.extend([representative] * len(members))

    original = [member for members in total for member in members]

    md = {'original_com': original, 'clean_com': pop_com}
    myDF = pd.DataFrame(md)
    return myDF
Ejemplo n.º 57
0
def fuzzy_match(s1, s2):
    """Return True when the ``fuzz.ratio`` of the two strings exceeds 95."""
    return fuzz.ratio(s1, s2) > 95
Ejemplo n.º 58
0
    def get_matching_node_st(query, relation_dict):
        """
        Find the entry of ``relation_dict`` that best matches ``query``.

        The query is lowercased, stripped of "-" characters, tokenized and
        lemmatized before matching. An exact key hit short-circuits with a
        score of 100; otherwise every key and its "hasRelatedSynonym" /
        "hasExactSynonym" values are scored with ``fuzz.ratio``.

        :param query: search term to look up
        :param relation_dict: mapping of term -> dict holding the lists
            "hasRelatedSynonym" and "hasExactSynonym"
        :return: list ``[score, matched key, matched synonym]``; the synonym
            is "" when the key itself matched, and ``[0, "", ""]`` is
            returned when nothing scored above 0
        """
        cell_query = query.lower()
        clean_string = []
        for word in cell_query.split(" "):
            # NOTE(review): this condition keeps every character except "-";
            # if the intent was "alphanumerics and hyphens only" it should
            # read `e == "-"` -- confirm before changing.
            e = ''.join(e for e in word if e.isalnum() or e != "-")
            if e != "":
                clean_string.append(e)
        cell_query = " ".join(clean_string)
        # Lemmatize each token so inflected forms compare equal.
        wnl = WordNetLemmatizer()
        tokens = [token.lower() for token in word_tokenize(cell_query)]
        lemmatized_words = [wnl.lemmatize(token) for token in tokens]
        cell_query = " ".join(lemmatized_words)

        # [score, key, synonym] of the best candidate seen so far.
        best_match = [0, "", ""]
        # NOTE(review): `tissue_in_ont` is never read.
        tissue_in_ont = ""
        if cell_query in relation_dict:
            return [100, cell_query, ""]
        else:
            for disease, value in relation_dict.items():
                # Score the key itself; 0 when the first characters differ,
                # the query is shorter than 3 characters, or anything raises
                # (e.g. indexing an empty string).
                try:
                    if disease[0].lower() != cell_query[0].lower() or len(
                            cell_query) < 3:
                        partial = 0
                    else:
                        partial = fuzz.ratio(disease, cell_query)
                except BaseException:
                    partial = 0
                # Related synonyms, scored under the same rules.
                for syn in value["hasRelatedSynonym"]:
                    try:
                        if syn[0].lower() != cell_query[0].lower() or len(
                                cell_query) < 3:
                            partial_related_syn = 0
                        else:
                            partial_related_syn = fuzz.ratio(cell_query, syn)
                    except BaseException:
                        partial_related_syn = 0
                    # print("FuzzyWuzzy Ratio: ", fuzz.ratio(tissue, text), tissue)
                    # print("FuzzyWuzzy Ratio_PARTIAL: ", partial, tissue)
                    # Inside the synonym loops a key match is accepted only
                    # if the key is shorter than twice the query length.
                    if best_match[0] < partial:
                        if disease.lower()[0] == cell_query[
                                0] and len(cell_query) * 2 > len(disease):
                            best_match = [partial, disease, ""]
                    if best_match[0] < partial_related_syn:
                        if syn.lower()[0] == cell_query[0]:
                            best_match = [partial_related_syn, disease, syn]
                # Exact synonyms: same procedure as for related synonyms.
                for syn in value["hasExactSynonym"]:
                    try:
                        if syn[0].lower() != cell_query[0].lower() or len(
                                cell_query) < 3:
                            partial_related_syn = 0
                        else:
                            partial_related_syn = fuzz.ratio(cell_query, syn)
                    except BaseException:
                        partial_related_syn = 0
                    # print("FuzzyWuzzy Ratio: ", fuzz.ratio(tissue, text), tissue)
                    # print("FuzzyWuzzy Ratio_PARTIAL: ", partial, tissue)
                    if best_match[0] < partial:
                        if disease.lower()[0] == cell_query[
                                0] and len(cell_query) * 2 > len(disease):
                            best_match = [partial, disease, ""]
                    if best_match[0] < partial_related_syn:
                        if syn.lower()[0] == cell_query[0]:
                            best_match = [partial_related_syn, disease, syn]
                # Final key check without the length restriction.
                if best_match[0] < partial:
                    if disease.lower()[0] == cell_query[0]:
                        best_match = [partial, disease, ""]
        return best_match
Ejemplo n.º 59
0
def _tag_speaker_lines(lines, askers, answerers, operators):
    """Tag every line that fuzzy-matches a speaker name.

    Returns ``(statuses, positions)``: parallel lists holding 'q' (asker),
    'a' (answerer) or 'o' (operator) and the index of the tagged line.
    Asker names take precedence over answerer names, then operators.
    """
    statuses = []
    positions = []
    for position, line in enumerate(lines):
        if match_check(line, askers):
            tag = 'q'
        elif match_check(line, answerers):
            tag = 'a'
        elif match_check(line, operators):
            tag = 'o'
        else:
            continue
        statuses.append(tag)
        positions.append(position)
    return statuses, positions


def process_call_2(path):
    """Extract question/answer pairs from a call transcript file.

    The transcript must contain the lines "executives", "analysts" and
    "operator" (case-insensitive), which delimit the lists of answering and
    asking participants, followed by a "Question-and-Answer-Session" header.
    Lines after the header that fuzzy-match a participant name are treated
    as speaker changes.

    Args:
        path: path of the UTF-8 transcript (undecodable bytes are ignored).

    Returns:
        None when the transcript states there is no Q&A session; otherwise a
        list with one dict per answered question, each of the form
        ``{'question_<n>': {'ask_name', 'question', 'ans_name', 'answer'}}``.
        Names are lowercased; question/answer are lists of original lines.

    Raises:
        ValueError: if one of the three section lines is missing.
        IndexError: if no Q&A header line is found.
        AssertionError: if an answerer speaks before any question.
    """
    with open(path, encoding="utf-8", errors='ignore') as handle:
        a_orig = handle.read()
    a_orig = a_orig.replace('–', '-')
    a_orig = re.split('\r|\n', a_orig)
    a = [aa.lower() for aa in a_orig]
    if 'no q&a session for this event' in ''.join(a):
        return None

    x = a.index('executives')  # start of the list of names
    y = a.index('analysts')  # could a transcript lack this list?
    z = a.index('operator')
    anss = a[x + 1:y] + ['unidentified company representative']
    anss_short = [re.sub(' - .+', '', anss_1) for anss_1 in anss]
    askk = a[y + 1:z] + ['unidentified analyst - unidentified company'
                         ] + ['unidentified analyst']
    askk_short = [re.sub(' - .+', '', ask) for ask in askk]
    opt = ['operator']

    # Find where the Q&A part starts.
    # NOTE(review): the `or` means the scan stops only at a line that
    # matches BOTH spellings; `and` (stop at either) may have been intended.
    # Kept as-is because the two spellings are themselves near-identical.
    header = 'Question-and-Answer-Session'.lower()
    header_alt = 'Question-&-nswer-Session'.lower()
    i = 0
    while (fuzz.ratio(a[i], header) < 80) or (fuzz.ratio(a[i], header_alt)
                                              < 80):
        i += 1
    # Both lists drop the same empty lines, so their indices stay aligned.
    b = [line for line in a[i + 1:] if line != '']
    b_orig = [line for line in a_orig[i + 1:] if line != '']

    # Try full asker names with short answerer names first, then fall back
    # to other name variants while the tagging looks lopsided or empty.
    statu_list, idx_list = _tag_speaker_lines(b, askk, anss_short, opt)
    if statu_list.count('a') * 2 < statu_list.count('q'):
        statu_list, idx_list = _tag_speaker_lines(b, askk, anss, opt)
    if statu_list.count('q') * 2 < statu_list.count('a'):
        statu_list, idx_list = _tag_speaker_lines(b, askk_short, anss_short,
                                                  opt)
    if ('q' not in statu_list) and ('a' not in statu_list):
        statu_list, idx_list = _tag_speaker_lines(b, askk_short, anss, opt)

    res_list = []
    ques_count = 0
    last = len(idx_list) - 1
    for idx, status in enumerate(statu_list):
        if status == 'q':
            # A run of consecutive askers is handled by its first member.
            if idx >= 1 and statu_list[idx - 1] == 'q':
                continue
            ques_count += 1
            entry = {'ask_name': b[idx_list[idx]]}
            res_dict = {'question_' + str(ques_count): entry}

            que_start_index = idx_list[idx] + 1
            idx_next_que = idx + 1
            # `and` short-circuits; the previous `&` evaluated the index
            # expression past the end and raised IndexError when the
            # transcript ended on asker lines.
            while (idx_next_que <= last
                   and statu_list[idx_next_que] == 'q'):
                entry['ask_name'] += ', ' + b[idx_list[idx_next_que]]
                idx_next_que += 1
            if idx_next_que > last:
                # No further speaker: mirror the answer branch below.
                que_end_index = len(b) - 1
            else:
                que_end_index = idx_list[idx_next_que]
            entry['question'] = b_orig[que_start_index:que_end_index]

        elif status == 'a':
            assert ques_count >= 1
            entry = res_dict['question_' + str(ques_count)]
            # Only the first run of answerers per question is recorded.
            if 'ans_name' in entry:
                continue
            entry['ans_name'] = b[idx_list[idx]]
            ans_start_index = idx_list[idx] + 1
            idx_next_ans = idx + 1
            while (idx_next_ans <= last
                   and statu_list[idx_next_ans] == 'a'):
                entry['ans_name'] += ', ' + b[idx_list[idx_next_ans]]
                idx_next_ans += 1

            if idx_next_ans > last:
                # NOTE(review): this excludes the transcript's final line;
                # kept unchanged -- confirm whether that is intended.
                ans_end_index = len(b) - 1
            else:
                ans_end_index = idx_list[idx_next_ans]
            entry['answer'] = b_orig[ans_start_index:ans_end_index]
            res_list.append(res_dict)

    return res_list
Ejemplo n.º 60
0
def fuzzy_ratio_similarity(str1, str2):
    """Return the ``fuzz.ratio`` similarity score of the two strings."""
    similarity = fuzz.ratio(str1, str2)
    return similarity