Example #1
0
def isstandup(sniptxt):
    """Return True if the snippet's first line fuzzily mentions "standup" or "sync".

    A fuzzywuzzy partial-ratio score above 70 against either keyword counts
    as a match.
    """
    first_line = sniptxt.split("\n")[0].lower()
    standup_score = fuzz.partial_ratio("standup", first_line)
    sync_score = fuzz.partial_ratio("sync", first_line)
    return standup_score > 70 or sync_score > 70
Example #2
0
    def fuzzy_contains(event):
        """Check whether an event already exists in the database.

        Searches by date range first, then fuzzily by name, so near-duplicate
        events (slightly different name or date) are still found.

        :event Event - The event being checked
        :return Event - the best matching stored event, or None if no match
            (note: returns the event itself, not a Boolean)
        """
        # Events starting within 3 days of each other are date candidates.
        time_fuzz = dt.timedelta(days=3)

        query = (Event.start < event.start + time_fuzz) & \
                (Event.start > event.start - time_fuzz)
        if event.end is not None:
            # BUG FIX: `Event.end is None` was a Python identity test (always
            # False at query-build time) instead of a SQL NULL check, and the
            # lower bound used `+ time_fuzz`, making the end-date window empty.
            query &= (Event.end.is_(None)) | \
                ((Event.end < event.end + time_fuzz) &
                 (Event.end > event.end - time_fuzz))

        # Search for good date matches.
        date_matches = Event.query.filter(query).all()

        # Filter for good name matches (partial_ratio >= 70).
        res = [e for e in date_matches
               if fuzz.partial_ratio(e.name, event.name) >= 70]

        # Return None if no results come up.
        if not res:
            return None

        # Best result: highest partial_ratio, ties broken by the full ratio.
        e = max(res, key=lambda e: (fuzz.partial_ratio(e.name, event.name),
                                    fuzz.ratio(e.name, event.name)))
        return e
Example #3
0
def match_products(ingredients, limit=5):
    """Fuzzy-match each ingredient against the global product catalogue.

    For every ingredient dict (keys 'ingredient' and 'amount'), the stemmed
    ingredient name is compared to each product's stemmed name and category;
    the `limit` products with the highest combined score are kept.

    Returns a list of dicts with keys 'ingredient', 'amount', 'products'.
    """
    all_matches = []
    for ingr_dict in ingredients:
        amount = ingr_dict.get('amount', None)
        ingredient = ingr_dict.get('ingredient', None)
        try:
            stemmed = elastic_stemmer(ingredient)
            scored = []
            for product in products:
                # One score per field; ranking uses their sum.
                ratios = [
                    fuzz.partial_ratio(stemmed, product['stemmed_name']),
                    fuzz.partial_ratio(stemmed, product['stemmed_category3']),
                ]
                scored.append((product, ratios))
            scored.sort(key=lambda pair: sum(pair[1]), reverse=True)
            matched = [pair[0] for pair in scored[:limit]]
        except Exception:
            # Any failure (e.g. stemming error) yields no matches for this one.
            matched = []
        all_matches.append({
            'ingredient': ingredient,
            'amount': amount,
            'products': matched, })
    return all_matches
Example #4
0
def checker(url, params, headers, GET, delay, payload, positions, timeout, encoding):
    """Measure how faithfully a payload is reflected in the HTTP response.

    Wraps the payload in 'st4r7s'/'3nd' markers, sends the request, locates
    the surviving markers in the response, and scores each expected position
    with fuzz.partial_ratio (0-100).  Returns the non-zero efficiencies.

    NOTE(review): `encoding`, when set, is used as a callable transform on
    the probe string -- presumably a payload encoder; confirm with callers.
    """
    checkString = 'st4r7s' + payload + '3nd'
    if encoding:
        checkString = encoding(unquote(checkString))
    response = requester(url, replaceValue(
        params, xsschecker, checkString, copy.deepcopy), headers, GET, delay, timeout).text.lower()
    reflectedPositions = []
    # Offsets where the start marker survived in the response body.
    for match in re.finditer('st4r7s', response):
        reflectedPositions.append(match.start())
    filledPositions = fillHoles(positions, reflectedPositions)
    # Iterating over the reflections
    num = 0
    efficiencies = []
    for position in filledPositions:
        allEfficiencies = []
        try:
            # Score the reflection found at the num-th marker offset.
            reflected = response[reflectedPositions[num]
                :reflectedPositions[num]+len(checkString)]
            efficiency = fuzz.partial_ratio(reflected, checkString.lower())
            allEfficiencies.append(efficiency)
        except IndexError:
            # Fewer reflections than expected positions; skip this score.
            pass
        if position:
            reflected = response[position:position+len(checkString)]
            if encoding:
                checkString = encoding(checkString.lower())
            efficiency = fuzz.partial_ratio(reflected, checkString)
            # A backslash-escaped reflection still counts as near-complete.
            if reflected[:-2] == ('\\%s' % checkString.replace('st4r7s', '').replace('3nd', '')):
                efficiency = 90
            allEfficiencies.append(efficiency)
            efficiencies.append(max(allEfficiencies))
        else:
            efficiencies.append(0)
        num += 1
    return list(filter(None, efficiencies))
Example #5
0
    def get_best_match(self, term):
        """Search the service for `term` ("artist - track") and return the
        key of the first result whose artist and track fuzzy-match above 75.

        Also accepts the artist/track fields swapped (at_score/ta_score).
        Returns None when nothing matches.
        """
        response = self.call("search", {"query": term, "types": "Track"})

        # This is a hack that should be fixed
        print " + Searching for %s" % term
        try:
            artist, track = term.lower().split(" - ")
        except ValueError:
            # Three " - "-separated parts: the middle one is discarded.
            artist, whatever, track = term.lower().split(" - ")

        # Strip parenthesised qualifiers, e.g. "(remix)".
        artist = re.sub("\([^)]+\)", "", artist)
        track = re.sub("\([^)]+\)", "", track)

        for result in response["result"]["results"]:
            r_artist, r_track = result["artist"].lower(), result["name"].lower()
            artist_score, track_score = fuzz.partial_ratio(artist, r_artist), fuzz.partial_ratio(track, r_track)
            # Cross scores, in case the feed swaps artist and track.
            at_score, ta_score = fuzz.partial_ratio(artist, r_track), fuzz.partial_ratio(track, r_artist)

            print "%s - %s (%d/%d)" % (r_artist, r_track, artist_score, track_score)

            if  artist_score > 75 and track_score > 75 or at_score > 75 and ta_score > 75:
                print " + Song added:     %s - %s" % (r_artist, r_track)
                return result["key"]

        print " + Song not found: %s" % term

        return None
Example #6
0
def matchGND(data):
    """Enrich records with OCLC links from GND (d-nb.info) and check titles.

    For a record's DNB URI, fetches the Turtle description, appends any new
    OCLC identifiers, and prints title pairs whose fuzzy score stays below
    80 for manual review.

    NOTE(review): the `return(data)` sits inside the inner `for dnb` loop,
    so only the first DNB URI of the first record is ever processed -- this
    looks like an indentation bug; confirm before dedenting.
    """
    for n in data:
        for dnb in data[n]['dnb']:
            if dnb:
                dnb = dnb.replace('/about/rdf', '')
                resp = requests.get(dnb + '/about/lds',
                                    headers={'Accept': 'application/turtle'})
                try:
                    grp = rdflib.Graph().parse(data=resp.content, format='turtle')
                    # Collect OCLC numbers ('(OColc)' prefix) not yet recorded.
                    for o in grp.objects(rdflib.term.URIRef(dnb), DC.identifier):
                        if '(OColc)' in o and o.replace('(OColc)', '') not in data[n]['oclc']:
                            newoclc = 'http://worldcat.org/oclc/' + o.replace('(OColc)', '')
                            print(newoclc)
                            data[n]['oclc'].append(newoclc)
                    doc_title = data[n]['title'][0]
                    resp_title = ''
                    title_score = title2_score = 0
                    # Compare the local title against DC.title and RDAU.P60493.
                    for o in grp.objects(rdflib.term.URIRef(dnb), DC.title):
                        title_score = fuzz.partial_ratio(doc_title, o)
                        resp_title = o
                    for o in grp.objects(rdflib.term.URIRef(dnb), RDAU.P60493):
                        title2_score = fuzz.partial_ratio(doc_title, o)
                    if max(title_score, title2_score) < 80:
                        # Weak title match: print both for manual review.
                        print(doc_title)
                        print(resp_title)
                except rdflib.plugins.parsers.notation3.BadSyntax:
                    # A bare GND root URI yields unparsable data; drop it.
                    if dnb == 'http://d-nb.info/gnd/':
                        data[n]['dnb'].remove(dnb)
                    print('ERROR URI: ' + n)
            return(data)
Example #7
0
    def _employees(self, company_name="", keyword=None):
        ''' Linkedin Scrape '''
        # TODO - add linkedin directory search
        ''' Linkedin Scrape'''
        # Google-dork query restricted to linkedin.com profile pages,
        # excluding directory/job/company listing URLs.
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"'
        args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        #results = Google().search(qry, 10)
        results = Google().search(qry, 1)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        # '(?i)' prefix makes the fuzzy company comparison case-insensitive.
        _name = '(?i){0}'.format(company_name)
        print results.columns
        if results.empty: 
            print "No employees found for", company_name, keyword
            return results

        # Multi-word names use partial_ratio; single words the stricter ratio.
        if " " in company_name:
            results['company_score'] = [fuzz.partial_ratio(_name, company) 
                                        for company in results.company_name]
        else:
            results['company_score'] = [fuzz.ratio(_name, company) 
                                        for company in results.company_name]
        if keyword:
            # Keep only rows whose title fuzzily matches the keyword (> 75).
            results['score'] = [fuzz.partial_ratio(keyword, title) 
                                for title in results.title]
            results = results[results.score > 75]
        results = results[results.company_score > 49]
        results = results.drop_duplicates()
        return results
Example #8
0
 def sort_func(self, row_a, row_b, data=None):
     """ListBox sort callback ordering rows by fuzzy match with the search.

     When search text is present, returns False when row_a scores strictly
     higher than row_b (keep row_a first), True otherwise.  With an empty
     search entry, always returns False.
     """
     if not self.b.get_object("searchentry1").get_text():
         return False
     score_a = fuzz.partial_ratio(self.current_search, row_a.entry_name.lower())
     score_b = fuzz.partial_ratio(self.current_search, row_b.entry_name.lower())
     return score_a <= score_b
Example #9
0
    def testIssueSeven(self):
        """Regression test for issue #7: partial_ratio on similar place names."""
        base = "HSINCHUANG"
        for candidate in ("SINJHUAN", "LSINJHUANG DISTRIC", "SINJHUANG DISTRICT"):
            self.assertTrue(fuzz.partial_ratio(base, candidate) > 75)
def match(dh,shas,index, ratio=100):
    """Locate the Talmud line(s) matching a Rashi dibbur hamatchil (dh).

    Scans every line of `shas` for an exact or fuzzy (partial_ratio >= ratio)
    match.  Exactly one hit records a link; several hits are logged as
    ambiguous; zero hits retry recursively with `ratio` lowered by 2 (down to
    60) before recording a miss.  Results accumulate in module-level globals.
    """
    found_dict = {}
    list_of_found_links = []
    found_dict['index']= index
    found_dict['dh'] = dh
    found = 0
    # Counters/collections shared across calls live at module level.
    global too_match
    global matched
    global non_match
    global count
    for line_n, line in enumerate(shas):
        # Mutual containment counts as an exact match.
        if dh in line and line in dh:
            found += 1
            the_line=line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh,line) >= ratio and fuzz.partial_ratio(line,dh) >= ratio:
             found +=1
             the_line=line_n
             list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh,line) >= ratio:
             found+=1
             the_line=line_n
             list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(line, dh) >= ratio:
             found+=1
             the_line=line_n
             list_of_found_links.append(line_n)
    if found > 1:
        # Ambiguous: more than one candidate line.
        found_dict['lines']=list_of_found_links
        too_match +=1
        list_of_many_finds[index]=dh
        found_dict["more_than_one"]="TRUE"
        found_list.append(found_dict)
    if found ==1:
        found_dict['lines']=list_of_found_links
        found_dict["more_than_one"]="FALSE"
        dafamud = convert_inf_to_daf(index)
       # if len(links_list)>0:
           # previos = re.split(":",links_list[len(links_list)-1])[1]
            #if int(previos) > the_line+1:
             #   print "previos", previos, "this",  the_line+1
        link = "Rashi on %s" % masechet +" " + dafamud +":"+ str(the_line+1) + " " + dh
        links_list.append(link)
        print "found!!",dafamud,":", the_line," ", dh
        matched+=1
        found_list.append(found_dict)
    if found == 0:
        if ratio > 60:
            # Relax the threshold and retry before giving up.
            match(dh,shas, index, ratio-2)
        else:
            list_of_found_links.append(-1)
            found_dict['lines']=list_of_found_links
            found_dict["more_than_one"]="FALSE"
            print len(dh)
            non_match += 1
            found_list.append(found_dict)
Example #11
0
 def match(self, orig_dh, page, dh_position, ratio=85):
     """Find page paragraphs matching a dibbur hamatchil, with fallbacks.

     Tries, in order: exact substring, whole-paragraph partial_ratio, then
     per-phrase partial_ratio (including acronym expansions when the dh
     contains quote marks).  Hits are appended to
     self.found_dict[dh_position][orig_dh] as (line_number, score) pairs.
     On zero hits, retries with `ratio` lowered by self.step until
     self.min_ratio, then logs the dh to self.non_match_file.
     """
     partial_ratios = []
     self.found_dict[dh_position] = {}
     self.found_dict[dh_position][orig_dh] = []
     dh = self.removeEtcFromDH(orig_dh)
     found = 0
     dh_acronym_list = []
     # Quote characters signal acronyms needing expansion before matching.
     if dh.find('"') >= 0 or dh.find("'") >= 0:
         dh_acronym_list = self.replaceAcronyms(dh)
     for line_n, para in enumerate(page):
         found_this_line = False
         para = self.removeHTMLtags(para)
         para = para.encode('utf-8')
         if dh in para:
             found += 1
             self.found_dict[dh_position][orig_dh].append((line_n, 100))
             continue
         para_pr = fuzz.partial_ratio(dh, para)
         if para_pr < 40:  # not worth checking
             continue
         elif para_pr >= ratio:
             found += 1
             self.found_dict[dh_position][orig_dh].append((line_n, para_pr))
             continue
         # Borderline paragraph: re-check phrase by phrase.
         phrases = self.splitPara(para, len(dh))
         for phrase in phrases:
             phrase_pr = fuzz.partial_ratio(dh, phrase)
             if found_this_line == True:
                 break
             if dh in phrase:
                 found += 1
                 self.found_dict[dh_position][orig_dh].append((line_n, 100))
                 break
             elif phrase_pr >= ratio:
                 found += 1
                 self.found_dict[dh_position][orig_dh].append((line_n, phrase_pr))
                 break
             for expanded_acronym in dh_acronym_list:  # only happens if there is an acronym, found_dh refers to expanded acronym
                 acronym_pr = fuzz.partial_ratio(expanded_acronym, phrase)
                 if expanded_acronym in phrase:
                     found += 1
                     self.found_dict[dh_position][orig_dh].append((line_n, 100))
                     found_this_line = True
                     break
                 elif acronym_pr >= ratio:
                     found += 1
                     self.found_dict[dh_position][orig_dh].append((line_n, acronym_pr))
                     found_this_line = True
                     break
     if found == 0:
         if ratio > self.min_ratio:
             # Relax the threshold and retry.
             self.match(orig_dh, page, dh_position, ratio - self.step)
         else:
             self.non_match_file.write(orig_dh)
             self.non_match_file.write("\n")
Example #12
0
	def post_title_extract(self,sel,response):
		"""Extract the best post title for a scraped blog page.

		Candidates from h1/h2/h3 (and their anchors) are scored against a
		cleaned <title> text (partial_token_sort_ratio) and against the URL
		slug (partial_ratio).  Returns (title, title_xpath); falls back to
		the cleaned head title when both scores stay low.

		NOTE(review): rstrip('.html') strips any trailing '.','h','t','m','l'
		characters, not only the literal '.html' suffix -- confirm intent.
		"""
		title = None
		title_score = 0
		slug_score = 0
		title_xpath = None
		blog=self.get_domain(response.url)
		slug = response.url.split('/')[-1] or response.url.split('/')[-2]
		slug = slug.replace('-',' ').rstrip('.html')

		head_title = sel.xpath('//title/text()').extract()
		head_title = head_title[0] if head_title else ''
		if '|' in head_title:
			# Drop the '|'-separated segment that best matches the blog name.
			pos=[head_title.split('|')[0],head_title.split('|')[-1]]
			word = pos[0] if fuzz.partial_ratio(pos[0],blog)>fuzz.partial_ratio(pos[-1],blog) else pos[-1]
			head_title_clean = head_title.replace(word,'').replace('|','')
		else:
			head_title_clean = head_title
			# Strip boilerplate words shared with the alternate-feed titles.
			text_to_remove = sel.xpath('//link[@rel="alternate"]/@title').extract()
			if text_to_remove and head_title:
				words = (' '.join(text_to_remove)+head_title).split()
				if Counter(words).most_common(3):
					for wor in Counter(words).most_common(3):
						head_title_clean = head_title_clean.replace(wor[0],'')

		[h1,h1a,h2,h2a,h3,h3a]=["//h1","//h1/a","//h2","//h2/a","//h3","//h3/a"]
		head_xpaths = [h1a,h1,h2a,h2,h3a,h3]
		title_lists = [sel.xpath(head+'//text()').extract() for head in head_xpaths]
		title_dict = OrderedDict(zip(head_xpaths,title_lists))
		for title_xpaths,title_list in title_dict.iteritems():
			if title_list:
				for titles in title_list:
					#to prevent from one word getting higher score
					if titles.count(' ')>0 or head_title_clean.count(' ')<1:
						title_ratio = fuzz.partial_token_sort_ratio(titles,head_title_clean)
						if title_ratio>title_score:
							title_score = title_ratio
							title = titles
							title_xpath = title_xpaths
							if title_score==100 and title.count(' ')>0:
								break
						#slug_ratio to be added in case
						slug_ratio = fuzz.partial_ratio(titles.lower(),slug)
						if slug_ratio>80:
							slug_score = slug_ratio
							title = titles
							title_xpath = title_xpaths
							if slug_score==100:
								break
				if slug_score==100:
					break
				if title_score==100:
					break
		if title_score<51 and slug_score<81:
			title = head_title_clean
		return title,title_xpath
def search():
    """Flask endpoint: resolve a free-text job title to canonical titles.

    Reads `job_title` from the query string, strips detected seniority
    markers, fuzzy-ranks every canonical title, optionally re-scores when a
    canonical title is embedded inside the input, and returns the top three
    formatted as '(<score>) <title>[, seniorities]' in JSON.

    NOTE(review): dict.keys()[0] / dict.values()[0] indexing only works on
    Python 2; each true_job_titles_list item appears to be a one-pair dict.
    """
    input_job_title = seniority_detection(request.args.get('job_title'), seniority_descriptors_grouped_list, acronyms_job_title_list)
    results_true_job_titles = []

    for job_title in true_job_titles_list:

        # Remove detected seniority words absent from the canonical name.
        input_job_title_without_seniority = input_job_title[0]
        for sen in input_job_title[1]:
            if sen not in job_title.keys()[0]:
                input_job_title_without_seniority = input_job_title_without_seniority.replace(sen.lower(), '')
        input_job_title_without_seniority = input_job_title_without_seniority.replace(" "*2, " ").strip().lower()

        # Score raw and seniority-stripped variants; keep the better one.
        similarity_factor_value = max(similarity_factor(input_job_title[0], job_title.values()[0]),
                                      similarity_factor(input_job_title_without_seniority, job_title.values()[0])
                                      )
        results_true_job_titles.append({
                                        'Similarity factor': similarity_factor_value,
                                        'Canonical job title': job_title.keys()[0],
                                        'Seniority': input_job_title[1]
                                        })

    results_true_job_titles = sorted(results_true_job_titles, key=lambda k: k['Similarity factor'], reverse=True)[:10]

    # If a canonical title is embedded verbatim in the input but its prefix
    # is not a recognised seniority, re-score on the embedded portion only.
    need_change_list = False
    for num, item in enumerate(results_true_job_titles):
        if num == 0 and item['Canonical job title'].lower() in input_job_title[0] \
                    and fuzz.partial_ratio(item['Canonical job title'].lower(), input_job_title[0]) == 100 \
                    and input_job_title[0][:input_job_title[0].find(item['Canonical job title'].lower())-1].capitalize() in item['Seniority']:
            break
        elif item['Canonical job title'].lower() in input_job_title[0] \
                    and fuzz.partial_ratio(item['Canonical job title'].lower(), input_job_title[0]) == 100 \
                    and input_job_title[0][:input_job_title[0].find(item['Canonical job title'].lower())-1].capitalize() not in item['Seniority']:
            need_change_list = True
            short_input_job_title = input_job_title[0][input_job_title[0].find(item['Canonical job title'].lower()):]
            break

    if need_change_list:
        # Average the original score with the score on the embedded portion.
        for item in results_true_job_titles:
            item['Similarity factor'] = (item['Similarity factor'] + similarity_factor(short_input_job_title, item['Canonical job title'].lower()))/2.
        results_true_job_titles = sorted(results_true_job_titles, key=lambda k: k['Similarity factor'], reverse=True)

    # Format the top 3, appending seniorities not already implied by the
    # canonical title (or its acronym expansion).
    output_results = []
    for item in results_true_job_titles[:3]:
        seniorities = list(filter(lambda x: x not in item['Canonical job title'] and\
                                            len(filter(lambda y: y in replace_acronyms(item['Canonical job title'],
                                                                                       acronyms_job_title_list),
                                                       x.split())) == 0,
                                  item['Seniority']))
        if item['Seniority'] == [] or seniorities == []:
            output_results.append('({0}) {1}'.format(round(item['Similarity factor'], 1), item['Canonical job title']))
        else:
            output_results.append('({0}) {1}, {2}'.format(round(item['Similarity factor'], 1),
                                                          item['Canonical job title'], ', '.join(seniorities)))

    return jsonify(true_job_title=output_results)
Example #14
0
def mergeCSV(c1, c2, outfile, field_name):

    csvfile1 = file(str(c1), 'r')
    csvfile2 = file(str(c2), 'r')
    new_csvfile = file(str(outfile), 'w+')
    csv1 = csv.reader(csvfile1, delimiter=",")
    csv2 = csv.reader(csvfile2, delimiter=",")
    output = csv.writer(new_csvfile)
    csv2_rows = [row for row in csv2]

    if HEADER_SEARCH:
        print "headers in " + str(c2) + " " + str(zip(csv2_rows[0], range(0, len(csv2_rows[0]))))
        csv2_extract = int(raw_input("text above formatted ('data','index') select a header index to merge: "))

    first_pass = True
    for csv1_row in csv1:
        match = False
        for csv2_row in csv2_rows:
            output_row = csv1_row
            if first_pass:
                output_row = output_row + [str(field_name) + item for item in (csv2_row[csv2_extract:csv2_extract+MULTIMERGE])]
                match = True
                break
            elif str(csv1_row[0]) == str(csv2_row[0]):
                match = True
                for data in csv2_row[csv2_extract:csv2_extract+MULTIMERGE]:
                    output_row.append(data.split('.')[0].replace(',', ''))
        if (not match) and (FUZZY_MATCHING):
            output_rows = []
            output_names = []
            fuzzy_match = False
            for csv2_row in csv2_rows:
                output_row = []
                output_row.extend(csv1_row)
                if (fuzz.partial_ratio(str(csv1_row[0]), str(csv2_row[0])) > 95) or (fuzz.partial_ratio(str(csv2_row[0]), str(csv1_row[0])) > 95):
                    for data in csv2_row[csv2_extract:csv2_extract+MULTIMERGE]:
                        output_row.append(data.split('.')[0].replace(',', ''))
                    output_rows.append(output_row)
                    output_names.append(csv2_row[0])
                    fuzzy_match = True
            if fuzzy_match:
                print "Found fuzzy matches for {" + str(csv1_row[0]) + "}:"
                for i, name in enumerate(output_names):
                    print " (" + str(i) + ") " + name
                extraction_val = str(raw_input("Please select an index or enter 'n' for none: "))
                if extraction_val is "n":
                    output_row = csv1_row
                else:
                    match = True
                    output_row = output_rows[int(extraction_val)]
        if (not match) and (not first_pass):
            print "failed to find " + csv1_row[0] + " in second csv file"
        output.writerow(output_row)
        first_pass = False
Example #15
0
def match(dh,shas,index,dibur, ratio=100):
    """Locate the Talmud line(s) matching a Rashi dibbur hamatchil (dh).

    Variant of match() that also carries the full dibbur text and calls
    add_rashi() on a unique hit.  Zero hits retry recursively with `ratio`
    lowered by the module-level `step` until `min_ratio`.  Results
    accumulate in module-level globals.
    """
    found_dict = {}
    list_of_found_links = []
    found_dict['index']= index
    found_dict['dh'] = dh
    found_dict['dibbur'] = dibur
    found = 0
    # Counters/collections shared across calls live at module level.
    global too_match
    global matched
    global non_match
    global count
    for line_n, line in enumerate(shas):
        # Mutual containment counts as an exact match.
        if dh in line and line in dh:
            found += 1
            the_line=line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh,line) >= ratio and fuzz.partial_ratio(line,dh) >= ratio:
             found +=1
             the_line=line_n
             list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh,line) >= ratio:
             found+=1
             the_line=line_n
             list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(line, dh) >= ratio:
             found+=1
             the_line=line_n
             list_of_found_links.append(line_n)
    if found > 1:
        # Ambiguous: more than one candidate line.
        found_dict['lines']=list_of_found_links
        too_match +=1
        list_of_many_finds[index]=dh
        found_dict["more_than_one"]="TRUE"
        found_list.append(found_dict)
    if found ==1:
        found_dict['lines']=list_of_found_links
        found_dict["more_than_one"]="FALSE"
        dafamud = convert_inf_to_daf(index)
        link = "Rashi on %s" % masechet +" " + dafamud +":"+ str(the_line+1) + " " + dh
        add_rashi(index,dibur,the_line)
        links_list.append(link)
        print "found!!",dafamud,":", the_line," ", dh
        matched+=1
        found_list.append(found_dict)
    if found == 0:
        if ratio > min_ratio:
            # Relax the threshold and retry before giving up.
            match(dh,shas, index,dibur, ratio-step)
        else:
            list_of_found_links.append(-1)
            found_dict['lines']=list_of_found_links
            found_dict["more_than_one"]="FALSE"
            print len(dh)
            non_match += 1
            found_list.append(found_dict)
Example #16
0
def strict_compare_strings(string_one, string_two):
    """Return the highest score (0-100) across four fuzzywuzzy scorers.

    Considers plain ratio, partial_ratio, token_sort_ratio and
    token_set_ratio.  The original computed every scorer twice (once in the
    comparison, once in the assignment); a single max() call does the same
    with each scorer evaluated exactly once.
    """
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.partial_ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
Example #17
0
def similarity(n1, n2):
    """
    Returns the mean of the partial_ratio score for each field in the two
    entities. Note that if they don't have fields that match, the score will
    be zero.
    """
    name_score = fuzz.partial_ratio(n1, n2)
    type_score = fuzz.partial_ratio(G.node[n1]['type'], G.node[n2]['type'])
    return (name_score + type_score) / 2.0
 def compare_two_texts(self, string_a, string_b, normalize_value=True):
     """
     Compare two strings and return the value of the Partial Ratio
     algorithm; when normalize_value is True the score is normalized to
     the 0..1 range, otherwise the raw 0..100 score is returned.

     Both arguments must be of the same text type (both unicode or both
     str); anything else raises TypeError.
     """
     if ((isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
             (isinstance(string_a, str) and isinstance(string_b, str))):
         score = fuzz.partial_ratio(string_a, string_b)
         if normalize_value:
             return self.__normalized_value(score)
         # BUG FIX: the original computed the raw score here but never
         # returned it, so the un-normalized path always yielded None.
         return score
     else:
         raise TypeError
Example #19
0
    def resolve_prp(self, s: list, people: list):
        """Resolve pronouns (PRP*/WP* tags) in a tagged sentence to a person.

        :param s: list of (word, POS-tag) pairs for one sentence
        :param people: candidate person names, best candidate first
        :return: the sentence re-joined as a string with '(<person>)'
            inserted after the resolved pronoun, or None if none resolved

        Gendered pronouns (he/she/his/her) must agree with the guessed
        gender, and the candidate must fuzzily match self.poi (> 90);
        who/whom only needs the poi match.  Without candidates, the
        determiner's default person is used instead.
        """
        print('\t\tTrying to resolve prepositions')
        if people:
            print('\t\t People found to resolve')
            for n in s:
                # PRP*/WP* = personal or wh- pronoun POS tags.
                if re.match(r'(PRP.*|WP.*)', n[1]):
                    person = people[0]
                    if n[0].lower() in ['he', 'she', 'his', 'her']:
                        if not self.determiner.guessed_gender:
                            # First gendered pronoun fixes the guessed gender.
                            print('\t\tAssigning Gender')
                            if n[0].lower() in ['he', 'his']:
                                self.determiner.guessed_gender = ['he', 'his']
                            else:
                                self.determiner.guessed_gender = ['she', 'her']
                            print(self.determiner.guessed_gender)

                        if n[0].lower() in self.determiner.guessed_gender:
                            print('\t\tPreposition matched guessed gender')
                            if fuzz.partial_ratio(self.poi, person) > 90:
                                print('\t\t Matched preposition to poi, updating string')
                                # Splice '(<person>)' in after the pronoun.
                                txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                                txt += ' (' + person + ')'
                                txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                                return txt
                    elif n[0].lower() in ['who', 'whom']:
                        if fuzz.partial_ratio(self.poi, person) > 90:
                            print('\t\t Matched preposition to poi, updating string')
                            txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                            txt += ' (' + person + ')'
                            txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                            return txt
        else:
            print('\t\tNo people found to resolve, using default')
            for n in s:
                if re.match(r'(PRP.*|WP.*)', n[1]):
                    person = self.determiner.default
                    if n[0].lower() in ['he', 'she', 'his', 'her']:
                        if n[0].lower() in self.determiner.guessed_gender:
                            print('\t\t Matched preposition to poi, updating string')
                            txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                            txt += ' (' + person + ')'
                            txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                            return txt
                    elif n[0].lower() in ['who', 'whom']:
                        print('\t\t Matched preposition to poi, updating string')
                        txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                        txt += ' (' + person + ')'
                        txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                        return txt
        return None
Example #20
0
def search(word):
    """Fuzzy-search `word` in the global trie.

    Runs the Levenshtein trie search with a cost budget of 60% of the word
    length, takes the lowest-cost result set, and breaks ties between
    multiple candidates with fuzz.partial_ratio.  Returns None when the trie
    yields nothing.
    """
    cost_limit = int(len(word) * .6)
    # First row of the dynamic-programming matrix.
    first_row = range(len(word) + 1)
    hits = {}

    # Recursively search each branch of the trie.
    for ch in trie.children:
        searchRecursive(trie.children[ch], ch, word, first_row,
            hits, cost_limit)

    if not hits.keys():
        return None

    candidates = hits[min(hits.keys())]
    if len(candidates) > 1:
        top_score = 0
        for cand in candidates:
            score = fuzz.partial_ratio(word, cand)
            if score > top_score:
                winner = cand
                top_score = score
    else:
        winner = candidates[0]
    return winner
Example #21
0
    def addUtter(self, utter, translations):
        """Track dialogue frame slots for one utterance.

        Fuzzy-matches every known slot value against the top translation
        hypothesis; scores above 80 add the value to the running frame.
        Returns {'utter_index': ..., 'frame_label': ...} (frame_label only
        when the topic is known).
        """
        output = {'utter_index': utter['utter_index']}

        translated = translations['translated']
        top_hyp = translated[0]['hyp'] if len(translated) > 0 else ''

        seg = utter['segment_info']
        topic = seg['topic']

        # A 'B' (begin) tag starts a fresh frame for the new segment.
        if seg['target_bio'] == 'B':
            self.frame = {}

        if topic in self.tagsets:
            for slot, values in self.tagsets[topic].items():
                for value in values:
                    if fuzz.partial_ratio(value, top_hyp) > 80:
                        bucket = self.frame.setdefault(slot, [])
                        if value not in bucket:
                            bucket.append(value)
            # PLACE duplicating NEIGHBOURHOOD is redundant; drop it.
            if topic == 'ATTRACTION' and 'PLACE' in self.frame and 'NEIGHBOURHOOD' in self.frame and self.frame['PLACE'] == self.frame['NEIGHBOURHOOD']:
                del self.frame['PLACE']

            output['frame_label'] = self.frame
        return output
Example #22
0
def fw_partial_ratio(question1, question2):
    """Build a partial_ratio feature column for paired questions.

    Each pair is scored with fuzz.partial_ratio scaled to 0..1; the result
    is an (n, 1) numpy array of feature values.
    """
    scores = [[fuzz.partial_ratio(str(a), str(b)) / 100]
              for a, b in zip(question1, question2)]
    print("Created fuzz partial_ratio feature")
    return np.array(scores)
Example #23
0
    def addUtter(self, utter, translations):
        """Track dialogue frame slots for one utterance (native transcript).

        Fuzzy-matches each slot value's top Chinese translation against the
        utterance transcript; scores above 80 add the English entry to the
        running frame.  Returns {'utter_index': ..., 'frame_label': ...}
        (frame_label only when the topic is known).  `translations` is
        accepted for interface compatibility but unused here.
        """
        output = {'utter_index': utter['utter_index']}
        # FIX: this assignment previously mixed tab and space indentation,
        # which is a TabError under Python 3.
        transcript = utter['transcript']

        topic = utter['segment_info']['topic']

        # A 'B' (begin) tag starts a fresh frame for the new segment.
        if utter['segment_info']['target_bio'] == 'B':
            self.frame = {}

        if topic in self.translated_tagsets:
            for slot in self.translated_tagsets[topic]:
                for value_obj in self.translated_tagsets[topic][slot]:
                    entry_en = value_obj['entry_en']
                    if len(value_obj['translated_cn']) > 0:
                        top_hyp = value_obj['translated_cn'][0]

                        ratio = fuzz.partial_ratio(top_hyp, transcript)
                        if ratio > 80:
                            if slot not in self.frame:
                                self.frame[slot] = []
                            if entry_en not in self.frame[slot]:
                                self.frame[slot].append(entry_en)
            # PLACE duplicating NEIGHBOURHOOD is redundant; drop it.
            if topic == 'ATTRACTION' and 'PLACE' in self.frame and 'NEIGHBOURHOOD' in self.frame and self.frame['PLACE'] == self.frame['NEIGHBOURHOOD']:
                del self.frame['PLACE']

            output['frame_label'] = self.frame
        return output
def fuzzy(products_name_set, listings):
    """
    The function that uses Levenshtein distance to determine matching pairs
    of products and listings.

    Fix: the original compared scores with ``is 100``; identity comparison
    against an int literal is implementation-dependent (and a
    SyntaxWarning since Python 3.8). Use ``==``.

    :param products_name_set: Indexed product names (for faster matching)
    :param listings: Listings to be matched
    :return: dict mapping each matched product name to its listings
    """
    final_products = defaultdict(list)
    for listing in listings:
        # Candidates whose tokens are fully contained in the listing title.
        possible_products = set()
        for product_name in products_name_set:
            if fuzz.token_set_ratio(listing["new_title"], product_name) == 100:
                possible_products.add(product_name)

        if len(possible_products) > 1:
            # Ambiguous: require a perfect partial-ratio match as tiebreak.
            for possible_product in possible_products:
                if fuzz.partial_ratio(listing["new_title"], possible_product) == 100:
                    final_products[possible_product].append(listing)
        else:
            for possible_product in possible_products:
                final_products[possible_product].append(listing)
    return final_products
Example #25
0
def getRatio(var1, var2, alg):
    """Return 100 when the two strings are considered a match, else 0.

    A match requires fuzz.ratio >= 40 AND fuzz.token_set_ratio >= 90.
    The original also computed partial_ratio / token_sort_ratio and two
    extra thresholds, none of which influenced the result — removed as
    dead code.

    :param var1: first string
    :param var2: second string
    :param alg: unused dummy; kept for interface compatibility
    :return: 100 on match, 0 otherwise
    """
    r1test = 40
    r4test = 90  # 85 is probably too low --- too many FP

    # it seems that the quality of results can be improved if two (or)
    # -- more results are correlated: [1] can be lowered as long as [4] remains high
    r1 = fuzz.ratio(var1, var2)
    r4 = fuzz.token_set_ratio(var1, var2)

    if r1 >= r1test and r4 >= r4test:
        ratio = 100
        # reportRatio(var1, var2)
    else:
        ratio = 0

    return ratio
def scorePage(page, title, year):
	"""Score how well a wiki page matches a film title/year.

	Base score is the best fuzzy match of *title* against the page summary
	(partial) or the page title (full); bonuses are added when 'film' or
	the year appear in the title/summary. Returns a confidence in [0, 1].
	"""
	defaultBonus = 10
	bonuses = []
	bonusToScore = {'filmInTitle': defaultBonus, 'filmInSummary': 20, 'yearInTitle': defaultBonus, 'yearInSummary': 20}
	pageScore = 0
	# TODO: consider how (film) or year in the page.title could negatively affect fuzz.ratio. Consider replacing '(film)' with ''.
	pageScore = max( fuzz.partial_ratio(title, page.summary), fuzz.ratio(title, page.title))

	if 'film' in page.title:
		bonuses.append('filmInTitle')
		pageScore += bonusToScore['filmInTitle']

	if 'film' in page.summary:
		bonuses.append('filmInSummary')
		pageScore += bonusToScore['filmInSummary']

	if str(year) in page.title:
		bonuses.append('yearInTitle')
		pageScore += bonusToScore['yearInTitle']

	if str(year) in page.summary:
		bonuses.append('yearInSummary')
		pageScore += bonusToScore['yearInSummary']

	# Normalise by the maximum attainable score (100 base + all bonuses).
	pageConfidence = pageScore / (100 + sum(bonusToScore.values()))

	safePrint('\t\tPage Title-> ' + page.title)
	print('\t\tPage Bonus-> ' + ','.join(bonuses))
	safePrint('\t\tConfidence-> ' + str(pageConfidence))
	return pageConfidence
Example #27
0
def cycle(source, header, new_folder):
    """Process every file in ~/Desktop/<source>/, saving converted output.

    Files that fail any step are copied to the "Problem/" folder instead
    of aborting the batch. All source files are removed at the end.

    Fixes: the bare ``except:`` (which also swallowed KeyboardInterrupt /
    SystemExit) is narrowed to ``except Exception``; index loops are
    replaced by direct iteration; the loop variable no longer shadows the
    builtin ``file``.

    :param source: folder name under the user's Desktop
    :param header: expected column header passed to the parsers
    :param new_folder: destination folder name for converted files
    """
    folder_path = os.path.expanduser('~') + "/Desktop/" + source + "/"
    file_names = seek(folder_path)
    save_pathway = direct(new_folder)
    problem_pathway = direct("Problem/")

    for file_name in file_names:
        print('Cycle start')
        try:
            filename = os.path.splitext(file_name)[0]

            # Files from known problem sources need special header handling.
            case = None
            issues = ['Summa', 'VCU']
            for key in issues:
                if fuzz.partial_ratio(filename, key) > 85:
                    case = "odd_header"

            raw_data, date_mode = read_txt_file("%s%s" % (folder_path, file_name))
            col = order(general_parse(simplify(header, colify(raw_data, case)), header, date_mode), header)
            row = rowify(col)
            convert(row, filename, save_pathway)
        except Exception:
            # Best-effort: copy the offending file to the problem folder.
            shutil.copy("%s%s" % (folder_path, file_name), "%s%s" % (problem_pathway, file_name))

    for file_name in file_names:
        os.remove("%s%s" % (folder_path, file_name))
Example #28
0
    def _employees(self, company_name="", keyword=""):
        ''' Linkedin Scrape '''
        # TODO - add linkedin directory search
        ''' Linkedin Scrape'''
        # NOTE(review): Python 2 code (print statement below); Google,
        # CompanyExtraInfoCrawl, RQueue, Jigsaw, rq and q are project
        # helpers not visible here.
        # Exclude LinkedIn directory/job/company pages from the search.
        args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"'
        args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"'
        qry = '"at {0}" {1} {2} site:linkedin.com'
        qry = qry.format(company_name, args, keyword)
        results = Google().search(qry, 10)
        results = results.dropna()
        results = Google()._google_df_to_linkedin_df(results)
        _name = '(?i){0}'.format(company_name)
        # Multi-word company names use partial matching; single words full.
        if " " in company_name:
            results['company_score'] = [fuzz.partial_ratio(_name, company)
                                        for company in results.company]
        else:
            results['company_score'] = [fuzz.ratio(_name, company)
                                        for company in results.company]
        if keyword != "":
            results['score'] = [fuzz.ratio(keyword, title)
                                for title in results.title]
            results = results[results.score > 75]

        # Keep only rows whose company name plausibly matches.
        results = results[results.company_score > 64]
        results = results.drop_duplicates()
        data = {'data': results.to_dict('r'), 'company_name':company_name}
        CompanyExtraInfoCrawl()._persist(data, "employees", "")

        # When part of a queued batch, trigger the CSV upload once the
        # whole queue has completed.
        job = rq.get_current_job()
        print job.meta.keys()
        if "queue_name" in job.meta.keys():
          if RQueue()._has_completed(job.meta["queue_name"]):
            q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"])
        return results
Example #29
0
 def getId(self, title):
     """Search the movie API for *title* and return a movie id (or None).

     Exact token-sort matches are collected in *found*; near matches
     (token-sort > 85 and partial > 90) in *alt*. A single exact match is
     returned directly; multiple exact or any near matches prompt the
     user via movieSelect. NOTE(review): Python 2 code (print statements).
     """
     apiArgs = {'api_key' : self.api_key, 'query' : title}
     query = API_URL + self.api_search + "?" + urlencode(apiArgs)
     apiRequest = Request(query, headers=HEADERS)
     result = urlopen(apiRequest).read()
     data = json.loads(result)
             
     movieId = None
     found = {}
     alt = {}
     
     for i in data['results']:
         if i is None:
             continue
         
         if fuzz.token_sort_ratio(title, i[self.title]) == 100:
             movieId = str(i['id'])
             found[movieId] = {'title' : i[self.title], 'date' : i[self.date]}
         elif fuzz.token_sort_ratio(title, i[self.title]) > 85 and fuzz.partial_ratio(title, i[self.title]) > 90:
             altId = str(i['id'])
             alt[altId] = {'title' : i[self.title], 'date' : i[self.date]}
     
     if len(found) == 1:
         return movieId
     elif len(found) > 1:
         print "DUPLICATES FOUND, ENTER THE ID OR -1 TO SKIP"
         movieId = self.movieSelect(found)
     elif len(alt) > 0:
         print "ALTERNATES FOUND, ENTER THE ID OR -1 TO SKIP"
         movieId = self.movieSelect(alt)
     
     return movieId
Example #30
0
def parselocation(htmldoc):
    """Extract the location text from a police-report page and resolve it
    to a merged district boundary.

    Fix: ``union()`` returns a new geometry rather than mutating in
    place, so the original discarded every union result and effectively
    returned only the first matched district's boundary. The result is
    now reassigned.

    :param htmldoc: parsed lxml document
    :return: (boundary, descriptions) — the union of all matched district
        geometries (transformed to WGS84/EPSG:4326) and the concatenated
        district names, or (None, '') when no location/district matches
    """
    location_path = base_xpath + '''/div[@class='html5-section body']/div[@class='polizeimeldung'][2]'''
    elem_location = htmldoc.xpath(location_path)
    if elem_location[0].text is not None:
        location = elem_location[0].text
    else:
        return None, ''

    matches = []
    boundaries = []
    district_descriptions = ''
    from fuzzywuzzy import fuzz

    for district in districts:
        ratio = fuzz.partial_ratio(district.name, location)
        if ratio >= 90:
            matches.append(district)
            district.geometry.transform(4326)
            boundaries.append(district.geometry)
            district_descriptions += district.name

    if len(boundaries) == 0:
        return None, ''
    boundary = boundaries[0]
    for district_boundary in boundaries[1:]:
        # union() returns a new geometry; accumulate it.
        boundary = boundary.union(district_boundary)
    return boundary, district_descriptions
Example #31
0
 def rankNotes(self):
     """Score each patient note against the ranked top terms.

     A note's score is the sum over top terms of
     partial_ratio(term, note) weighted by the term's rank (earlier
     terms count more). Result is stored in self.note_counter.
     """
     self.note_counter = Counter()
     for note in self.patient_data['Notes']:
         for i in range(len(self.top)):
             self.note_counter[note] += fuzz.partial_ratio(
                 self.top[i][0], note) * (len(self.top) - i)
 def get_kind(kind):
     """Map a raw slot string to its canonical (Chinese) slot name.

     Returns the first canonical name whose fuzzy partial-ratio against
     *kind* exceeds 80; implicitly returns None when nothing matches —
     callers presumably handle that case. TODO confirm.
     """
     # Canonical artifact-slot names, in fixed order.
     kind_list = ["生之花", "死之羽", "时之沙", "空之杯", "理之冠"]
     for set_kind in kind_list:
         if partial_ratio(set_kind, kind) > 80:
             return set_kind
Example #33
0
 def testEmptyStringsScore0(self):
     """Two empty strings are defined to score 0, not 100."""
     self.assertEqual(fuzz.ratio("", ""), 0)
     self.assertEqual(fuzz.partial_ratio("", ""), 0)
Example #34
0
 def testPartialRatio(self):
     """s1/s3 are class fixtures expected to partial-match perfectly."""
     self.assertEqual(fuzz.partial_ratio(self.s1, self.s3), 100)
Example #35
0
def predict_chip_dict(wdir, input_pattern_str, bamExt, fromBam=None):
    """
    Predict a chip_dict from set of bam files
    ChIP input/control samples are identified from input_pattern (default: 'input')
    for each sample then the best input sample (by fuzzywuzzy score) is selected
    chip_dict is written as yaml to workflow workingdir
    predicts whether a sample is broad or narrow based on histone mark pattern

    :param wdir: workflow working directory (the predicted yaml is written here)
    :param input_pattern_str: patterns identifying input/control samples,
        delimited by ',', ' ', '|' or ';'
    :param bamExt: bam file extension used to derive sample names
    :param fromBam: optional directory to glob bam files from instead of
        wdir/filtered_bam/
    """
    # Turn the user-supplied, multi-separator pattern list into one
    # regex alternation (e.g. "input,ctrl" -> "input|ctrl").
    pat = "|".join(re.split(',| |\\||;', input_pattern_str))
    input_pat = r".*(" + pat + ")"
    clean_pat = r"" + pat + ""
    pat1 = re.compile(clean_pat, re.IGNORECASE)

    if fromBam:
        infiles = sorted(glob.glob(os.path.join(fromBam, '*' + bamExt)))
    else:
        infiles = sorted(
            glob.glob(os.path.join(wdir, 'filtered_bam/', '*.bam')))
    samples = get_sample_names_bam(infiles, bamExt)

    chip_dict_pred = {}
    chip_dict_pred["chip_dict"] = {}
    print(
        "---------------------------------------------------------------------------------------"
    )
    print("Predict Chip-seq sample configuration")
    print(
        "---------------------------------------------------------------------------------------"
    )
    print("\nSearch for Input/control samples...")

    # Samples whose names match the input pattern are controls.
    input_samples = set([])
    for i in samples:
        if re.match(input_pat, i, re.IGNORECASE):
            print("...found: ", i)
            input_samples.add(i)

    print("\nTry to find corresponding ChIP samples...")

    for i in samples:
        if i in input_samples:
            continue

        print(
            "\n sample: ",
            i,
        )
        # Score every input sample against this ChIP sample: strip the
        # input tag from the control name, then average four fuzzywuzzy
        # metrics.
        matches_sim = {}
        for j in input_samples:
            c_clean = pat1.sub("", j)
            sim1 = fuzz.ratio(c_clean, i) + fuzz.partial_ratio(
                c_clean, i) + fuzz.token_sort_ratio(
                    c_clean, i) + fuzz.token_set_ratio(c_clean, i)
            matches_sim[j] = sim1 / 4

        # Keep every input sample tied for the top score.
        sim = 0
        final_matches = set([])
        for key, value in sorted(matches_sim.items(),
                                 key=lambda k: (k[1], k[0]),
                                 reverse=True):
            if value >= sim:
                final_matches.add(key)
                print("   top matching input sample by score: %s = %s" %
                      (key, value))
                sim = value

        tmp = ':'.join(list(final_matches))

        if len(final_matches) > 1:
            tmp = "__PLEASE_SELECT_ONLY_ONE_CONTROL__:" + tmp
        elif len(final_matches) == 0:
            print("No control sample found!")

        chip_dict_pred["chip_dict"][i] = {}
        chip_dict_pred["chip_dict"][i]['control'] = tmp
        # These histone marks are associated with broad enrichment.
        if re.match(".*(H3K4me1|H3K36me3|H3K9me3|H3K27me3).*", i,
                    re.IGNORECASE):
            chip_dict_pred["chip_dict"][i]['broad'] = True
        else:
            chip_dict_pred["chip_dict"][i]['broad'] = False

    outfile = os.path.join(wdir, "chip_seq_sample_config.PREDICTED.yaml")
    write_configfile(outfile, chip_dict_pred)
    print(
        "---------------------------------------------------------------------------------------"
    )
    print("Chip-seq sample configuration is written to file ", outfile)
    print(
        "Please check and modify this file - this is just a guess! Then run the workflow with it."
    )
    print(
        "---------------------------------------------------------------------------------------"
    )
Example #36
0
def get_data(url, interest):
    """Scrape a faculty directory page and return professors whose
    research areas fuzzily match *interest*.

    NOTE(review): Python 2 code (print statement, xrange); also shadows
    the builtin ``list`` below.

    :param url: faculty directory page URL
    :param interest: research interest keyword to match (>= 80 partial ratio)
    :return: list of dicts with name/email/link/contact_detail/field keys
    """
    url = str(url)
    interest = str(interest)
    print interest
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')

    details = []
    contact_details = []
    names = []
    research = []
    emails = []
    links = []

    # Collect profile links and names (rows whose text contains "Dr.").
    for id in soup.find_all("a", href=True):
        if "./" in id['href']:
            if "Dr." in id.text:
                links.append("http://ee.iitd.ernet.in/people" +
                             id['href'].replace("./", "/"))
                names.append(id.text.replace("\n", ""))

    # Slice each detail cell into research area / phone / email by the
    # positions of the "Area", "Phone" and "Email" markers.
    for name in soup.find_all("td", {'width': '70%'}):
        details.append(name.text)
        tmp = name.text
        a = tmp.find("Area")
        p = tmp.find("Phone")
        e = tmp.find("Email")
        research.append(tmp[a + 5:])
        contact_details.append(tmp[p:e].replace("\n", ""))
        emails.append(tmp[e:a - 9].replace("\n", ""))

    # First row is a header, not a person.
    research.pop(0)
    emails.pop(0)
    contact_details.pop(0)

    fields = [
    ]  # 2-d array which consists all research areas in a list on that index!
    pos = 0
    i = 0
    # Split each research string on commas, but not commas nested
    # inside parentheses (x tracks the nesting depth).
    for data in research:
        pos = 0
        fields.append([])
        j = 0
        x = 0
        for a in data:
            if a == '(':
                x += 1
            if a == ')':
                x -= 1
            if a == ',':
                if x == 0:
                    fields[i].append(data[pos:j])
                    if data[j + 1] == " ":
                        pos = j + 2
                    else:
                        pos = j + 1
            if j == len(data) - 1:
                fields[i].append(data[pos:j])
            j += 1
        i += 1

    list = []
    for i in xrange(0, len(names)):
        list.append([])
        list[i].append(names[i].strip())
        list[i].append(emails[i].strip())
        list[i].append(links[i].strip())
        list[i].append(contact_details[i].strip())
        list[i].append(fields[i])

    # Keep each professor at most once (count == 1 short-circuits) when
    # any of their fields fuzzily matches the requested interest.
    i = 0
    k = 0
    final_list = []
    for x in list:
        count = 0
        for y in x[4]:
            rat = fuzz.partial_ratio(interest.lower(), y.lower())
            if count == 1:
                break
            if rat >= 80:
                final_list.append({})
                final_list[k]['name'] = names[i]
                final_list[k]['email'] = emails[i]
                final_list[k]['link'] = links[i]
                final_list[k]['contact_detail'] = contact_details[i]
                final_list[k]['field'] = fields[i]
                count += 1
                k = k + 1
        i += 1
    pprint(final_list)
    return final_list
Example #37
0
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Demo of the four main fuzzywuzzy scorers on small example strings.

# Simple Ratio
r = fuzz.ratio("this is a test", "this is a test!")
print(r)
# Partial Ratio
r = fuzz.partial_ratio("this is a test", "this is a test!")
print(r)
# Token Sort Ratio (order-insensitive, unlike plain ratio)
r1 = fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
r2 = fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
print(r1, r2)
# Token Set Ratio (duplicate-insensitive, unlike token sort)
r1 = fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
r2 = fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
print(r1, r2)
Example #38
0
def WRatio(s1, s2, force_ascii=True):
    """
    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit if this makes either string empty
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Run checks to compare the length of the strings
        * If one of the strings is more than 1.5 times as long as the other
          use partial_ratio comparisons - scale partial results by 0.9
          (this makes sure only full results can return 100)
        * If one of the strings is over 8 times as long as the other
          instead scale by 0.6

    #. Run the other ratio functions
        * if using partial ratio functions call partial_ratio,
          partial_token_sort_ratio and partial_token_set_ratio
          scale all of these by the ratio based on length
        * otherwise call token_sort_ratio and token_set_ratio
        * all token based comparisons are scaled by 0.95
          (on top of any partial scalars)

    #. Take the highest value from these results
       round it and return it as an integer.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :return:
    """

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True

    # Defaults; the branches below deliberately overwrite these, and
    # LATER branches take precedence over earlier ones — do not reorder.
    unbase_scale = .60
    partial_scale = .90

    base = fuzz.ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # if strings are similar length, don't use partials
    if abs(len(p2) - len(p1)) <= 1:
        try_partial = True
        partial_scale = 0.95
        unbase_scale = 0.65

    if abs(len(p2) - len(p1)) <= 2 and max(len(p2), len(p1)) > 6:
        try_partial = False

    if abs(len(p2) - len(p1)) >= 3 and max(len(p2), len(p1)) > 6:
        try_partial = True
        partial_scale = 0.85

    # Large relative length difference re-enables partials with a
    # heavier penalty (overrides any of the absolute-difference cases).
    if len_ratio > 2:
        try_partial = True
        partial_scale = 0.65

    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .60

    if try_partial:
        partial = fuzz.partial_ratio(p1, p2) * partial_scale
        ptsor = fuzz.partial_token_sort_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale
        ptser = fuzz.partial_token_set_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale

        return utils.intr(max(base, partial, ptsor, ptser))
    else:
        tsor = fuzz.token_sort_ratio(p1, p2, full_process=False) * unbase_scale
        tser = fuzz.token_set_ratio(p1, p2, full_process=False) * unbase_scale

        return utils.intr(max(base, tsor, tser))
Example #39
0
 def testPartialRatioUnicodeString(self):
     """A single accented character shares nothing with an ASCII string."""
     s1 = "\u00C1"
     s2 = "ABCD"
     score = fuzz.partial_ratio(s1, s2)
     self.assertEqual(0, score)
Example #40
0
def fuzzycheck(string1, string2):
    """Return True when the strings fuzzily match (partial ratio > 90)."""
    return fuzz.partial_ratio(string1, string2) > 90
Example #41
0
def similar(a, b):
    """Return the fuzzy partial-match score (0-100) between a and b."""
    return fuzz.partial_ratio(a, b)
import requests
from BeautifulSoup import BeautifulSoup
import re
import webbrowser
import os

# Fetch 10 random Wikipedia article ids/titles via the MediaWiki API,
# then offer each to the user and print the first paragraph on a
# fuzzy "Yes" answer. NOTE(review): Python 2 script (raw_input,
# BeautifulSoup 3 import); `fuzz` must be imported elsewhere in the file.
url = 'https://en.wikipedia.org/w/api.php?action=query&list=random&rnnamespace=0&rnlimit=10&format=xml'

response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)

ids = []
for item in soup.findAll(id=re.compile("[0-9]")):
    ids.append(item['id'])

titles = []
for name in soup.findAll(title=re.compile(".*")):
    titles.append(name['title'])

print('')
for i in range(len(titles)):
    ans = raw_input("Do you want to read about " + titles[i] + "?")
    # Fuzzy-accept anything close to "Yes".
    if fuzz.partial_ratio(ans, "Yes") > 50:
        soup = BeautifulSoup(
            requests.get('https://en.wikipedia.org/wiki?curid=' +
                         ids[i]).content)
        result = soup.find("div", {"id": "mw-content-text"}).find('p').text
        print(result)
        # Re-launch this script for the next round.
        os.system('P:\Python\Projects\wiki_v2.py')
Example #43
0
				if not oldAttrname[0] in outputData2[key].keys():
					outputData2[key][oldAttrname[0]] = 0
				outputData2[key][oldAttrname[0]] += 1
			else:
				if not key in outputData2[key].keys():
					outputData2[key][key] = 0
				outputData2[key][key] += 1


# Prune rare keys: a value-key seen fewer than 3 times is deleted unless
# it is fuzzily similar (>= 65) to its cluster key or already recorded as
# a known similar key in keySimInv.
for key, value in outputData2.items():
	
	keyList= list(value.keys())

	for k2 in keyList:
		if outputData2[key][k2] < 3:
			score2 = fuzz.partial_ratio(key, k2)
			if score2  < 65:
				if not key in keySimInv[k2].keys():
					print(f"Elimino {k2} da {key}")
					del outputData2[key][k2]
			else:
				print(key, k2, score2)

CommonUtilities.writeDictToJson(outputData2, f"{PHASE_3_SOURCE_DIR}/big_clusterkey_2.json")



###Passo 3 Calcolo lo score | Per ogni chiave confronto la lista delle chiavi valore con tutte le altre
###Score calcolato come % di inclusione dell 'insieme piu piccolo
keylist = list(outputData2.keys())
for k1 in keylist:
# Engineered pairwise features over the question1/question2 columns:
# distinct-character counts, word counts, common-word overlap, and the
# full suite of fuzzywuzzy similarity ratios.
data['len_char_q1'] = data.question1.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(
    set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))),
                                  axis=1)
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])),
    axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])),
    axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
Example #45
0
    def create_new_features(self):
        """Add engineered similarity features to self.data and return it.

        Builds frequency, length, word-overlap, token-based and fuzzywuzzy
        features over the question1/question2 columns.

        Fix: ``q2len`` was computed from ``question1`` (copy-paste bug);
        it now uses ``question2``.

        :return: self.data with the new feature columns added
        """
        # normal custom features

        self.data["question1"] = self.data["question1"].fillna("").apply(
            self.question_preprocess)
        self.data["question2"] = self.data["question2"].fillna("").apply(
            self.question_preprocess)

        self.data['freq_qid1'] = self.data.groupby(
            ['qid1'])['qid1'].transform('count')
        self.data['freq_qid2'] = self.data.groupby('qid2')['qid2'].transform(
            'count')

        self.data['q1len'] = self.data['question1'].str.len()
        self.data['q2len'] = self.data['question2'].str.len()

        self.data['q1_n_words'] = self.data['question1'].apply(
            lambda row: len(row.split(" ")))
        self.data['q2_n_words'] = self.data['question2'].apply(
            lambda row: len(row.split(" ")))

        self.data['word_Common'] = self.data.apply(self.normalized_word_Common,
                                                   axis=1)
        self.data['word_share'] = self.data.apply(self.normalized_word_share,
                                                  axis=1)

        self.data[
            'word_Total'] = self.data['q1_n_words'] + self.data['q2_n_words']

        self.data[
            'freq_q1+q2'] = self.data['freq_qid1'] + self.data['freq_qid2']
        self.data['freq_q1-q2'] = abs(self.data['freq_qid1'] -
                                      self.data['freq_qid2'])

        # advanced features
        self.token_features = self.data.apply(
            lambda x: self.get_token_features(x["question1"], x["question2"]),
            axis=1)

        # Unpack the fixed-order tuple returned by get_token_features.
        self.data["cwc_min"] = list(map(lambda x: x[0], self.token_features))
        self.data["cwc_max"] = list(map(lambda x: x[1], self.token_features))
        self.data["csc_min"] = list(map(lambda x: x[2], self.token_features))
        self.data["csc_max"] = list(map(lambda x: x[3], self.token_features))
        self.data["ctc_min"] = list(map(lambda x: x[4], self.token_features))
        self.data["ctc_max"] = list(map(lambda x: x[5], self.token_features))
        self.data["last_word_eq"] = list(
            map(lambda x: x[6], self.token_features))
        self.data["first_word_eq"] = list(
            map(lambda x: x[7], self.token_features))
        self.data["abs_len_diff"] = list(
            map(lambda x: x[8], self.token_features))
        self.data["mean_len"] = list(map(lambda x: x[9], self.token_features))

        self.data["token_set_ratio"] = self.data.apply(
            lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]),
            axis=1)
        self.data["token_sort_ratio"] = self.data.apply(
            lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]),
            axis=1)
        self.data["fuzz_ratio"] = self.data.apply(
            lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
        self.data["fuzz_partial_ratio"] = self.data.apply(
            lambda x: fuzz.partial_ratio(x["question1"], x["question2"]),
            axis=1)

        self.data["longest_substr_ratio"] = self.data.apply(
            lambda x: self.get_longest_substr_ratio(x["question1"], x[
                "question2"]),
            axis=1)

        return self.data
                splitted_attribute_where = i.split(" ")
                for e in splitted_attribute_where:
                    if e in filtered_sentence_where:
                        filtered_sentence_where.remove(e)
        if Where_attribute_exist == True:
            continue

    words_matching_dic_where = {}

    att_Dict_where = {}
    for n in DF_att:  ## iterating on tables names
        #print(n)
        #print(DF_att[n])
        for word1, word2 in product(filtered_sentence_where, DF_att[n]):
            Ratio = fuzz.ratio(word1.lower(), word2.lower())
            Partial_Ratio = fuzz.partial_ratio(word1.lower(), word2.lower())
            Token_Sort_Ratio = fuzz.token_sort_ratio(word1, word2)
            Token_Set_Ratio = fuzz.token_set_ratio(word1, word2)
            if Ratio >= 65:
                print(word1 + " matches with attribute  " + word2 +
                      " with ratio " + str(Ratio) + " in the where clause ")
                list_attributes_matched.append(word2)
                ID_list = [" ID ", "_ID"]
                ID_conflict = False
                for d in ID_list:
                    if d in word2 and d not in word1:
                        ID_conflict = True
                    elif d in word1 and d in word2:
                        mapping_Dic["Where"].append(word2)
                    # Df_name.append(n)
Example #47
0
def imageanalise(id_gen,source_pdfs,folders):
	"""Classify scanned license documents by OCR-ing cropped regions.

	Walks each folder's validation crops, detects a green marker region,
	OCRs the relevant text area, and fuzzily matches it against known
	document-type keywords (INSTALACAO/OPERACAO/PREVIA/LOCALIZACAO/...)
	to dispatch GenerateDoc with type-specific crop coordinates.

	:param id_gen: running document id counter (returned/advanced by
		GenerateDoc)
	:param source_pdfs: source PDF path info forwarded to GenerateDoc
	:param folders: iterable of (index, folder) pairs to process
	"""
	from fuzzywuzzy import fuzz
	from fuzzywuzzy import process
	# import glob
	import pyocr
	import pyocr.builders
	import re
	from PIL import Image
	from os import system

	# Creating a report file id_source,spath,pages,id_gen,destination,types,docid,outcome
	GenerateReport('ID Source','Source Path','Pages','ID Destination','Destination','Type','Doc ID','Outcome','', True)
	id_gen = int(id_gen)

	# Verify subfolder in main folder
	for i,folder in folders:
		vals = pathmapping(folder,'croped/croped_val1_*.jpg',False,True)
		pdf_pages_number = []
		
		# Check for validation images inside folder
		for numpage,val in reversed(list(vals)):
			print(val)
			green_grade = 0
			im = Image.open(val)

			jpg = val.replace('croped/croped_val1_','')

			# Saving PDF pages
			pdf_pages_number.append(numpage)
			
			# Check for green grade in image
			# (count pixels whose green channel clearly dominates red and blue)
			for pixel in im.getdata():
				if (pixel[1]>(pixel[2]+10) and pixel[1]>(pixel[0]+10)):
					green_grade += 1
			
			# Check text inside main area of analises
			if (green_grade >=200):

				# Build txt image in order to be analised
				cropimage(folder,jpg,100,120,700,270,'croped_txt1_')

				jpg_text = val.replace('val1','txt1')

				# Convert image into text mode
				tools = pyocr.get_available_tools()[0]
				text_txt1 = tools.image_to_string(Image.open(jpg_text), builder=pyocr.builders.DigitBuilder())

				# Fuzzy-match OCR output (>70) against license-type keywords.
				if fuzz.partial_ratio('INSTALACAO', text_txt1) > 70:
					id_gen = GenerateDoc(id_gen,source_pdfs,'LI',folder,jpg,280,400,715,475,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (LI) =========== \n')
				elif fuzz.partial_ratio('OPERACAO', text_txt1) > 70:
					id_gen = GenerateDoc(id_gen,source_pdfs,'LO',folder,jpg,280,400,715,475,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (LO) =========== \n')
				elif fuzz.partial_ratio('PREVIA', text_txt1) > 70:
					id_gen = GenerateDoc(id_gen,source_pdfs,'LP',folder,jpg,280,400,715,475,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (LP) =========== \n')
				elif fuzz.partial_ratio('LOCALIZACAO', text_txt1) > 70:
					id_gen = GenerateDoc(id_gen,source_pdfs,'LL',folder,jpg,100,410,715,475,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (LL) =========== \n')
				else:
					id_gen = GenerateDoc(id_gen,source_pdfs,'ERROR',folder,jpg,350,410,715,475,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (NL) =========== \n')
			else:
				# No green marker: try the secondary text regions.
				jpg_text2 = val.replace('val1','txt2')

				tools = pyocr.get_available_tools()[0]
				text_txt2 = tools.image_to_string(Image.open(jpg_text2), builder=pyocr.builders.DigitBuilder())

				if fuzz.partial_ratio('Sistema de Tratamento de Efluentes', text_txt2) > 70:
					id_gen = GenerateDoc(id_gen,source_pdfs,'STE',folder,jpg,380,60,620,130,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (STE) =========== \n')
				else:
					jpg_text3 = val.replace('val1','txt3')

					tools = pyocr.get_available_tools()[0]
					text_txt3 = tools.image_to_string(Image.open(jpg_text3), builder=pyocr.builders.DigitBuilder())
					startnum = val.rfind('_')
					endnum = val.rfind('.')
					# NOTE(review): pattern requires a literal 'r' before the
					# digits (e.g. "r123/2020"); confirm that is intended.
					doc_num = re.findall(r'r\d+/\d+|$', text_txt3)
					doc_num = ''.join(doc_num[0])
					doc_num = doc_num.replace('/','.')
					if doc_num == '':
						doc_num = str(id_gen)

					if fuzz.partial_ratio('LICENGA ESPECIAL', text_txt3) > 70:
						id_gen = GenerateDoc(id_gen,source_pdfs,'LE',None,jpg,0,0,0,0,pdf_pages_number,doc_num)
						print('\n ============ DOCUMENT FOUND (LE) =========== \n')
					elif int(val[startnum+1:endnum])==0:
						id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',None,jpg,0,0,0,0,pdf_pages_number,doc_num)
						print('\n ============ DOCUMENT NOT FOUND =========== \n')
	system('rm -r docclass/')
Example #48
0
def email_match(authors, emails):
    """Reorder *emails* so that result[i] is the email for authors[i].

    Builds a fuzzy-similarity matrix between every email local-part and
    five name variants of every author (last, first, initials,
    first-initial+last, full), then greedily assigns the highest-scoring
    (email, author) pairs.

    Fix: the bare ``except:`` clauses are narrowed — the inner one to
    IndexError (the name-split fallback), the outer one to Exception —
    preserving the best-effort zero-score behavior without swallowing
    KeyboardInterrupt/SystemExit.

    :param authors: list of author names ("Last, First" or "First Last")
    :param emails: list of email addresses
    :return: list of emails aligned with *authors* ('' where unassigned)
    """

    author_list = authors[:]  # create a copy of authors (not changing the input)

    # result = reordered emails list
    result = [''] * len(author_list)

    matrix = []
    for email in emails:
        ratios = []
        for author in author_list:
            try:
                email_id = email.split('@')[0]

                # Accept both "Last, First" and "First Last" name formats.
                try:
                    lname = author.split(', ')[0].lower()
                    fname = author.split(', ')[1].lower()
                except IndexError:
                    fname = author.split(' ')[0].lower()
                    lname = author.split(' ')[1].lower()

                initial = ''.join(
                    [i[0].lower()
                     for i in re.findall(r"[\w']+", fname)]) + ''.join(
                         [j[0].lower() for j in re.findall(r"[\w']+", lname)])
                f_lastname = fname[0] + lname
                name = fname + lname

                ratios.append([
                    fuzz.partial_ratio(lname, email_id),
                    fuzz.partial_ratio(fname, email_id),
                    fuzz.partial_ratio(initial, email_id),
                    fuzz.partial_ratio(f_lastname, email_id),
                    fuzz.ratio(name, email_id)
                ])

            except Exception:
                # Malformed author/email: contribute zero scores rather
                # than aborting the whole matching.
                ratios.append([0] * 5)

        ratios = np.array(ratios)
        ratios = np.transpose(ratios)
        matrix.extend(ratios)

    matrix = np.array(matrix)

    # Greedy assignment: walk all distinct scores from highest to lowest
    # and pair each email (block of 5 rows) with an unused author column.
    indices = {}
    for score in sorted(set(matrix.flat), reverse=True):
        cord = np.where(matrix == score)
        for c in list(zip(cord[0], cord[1])):
            if c[0] // 5 not in indices.keys() and c[1] not in indices.values(
            ):
                indices[c[0] // 5] = c[1]

            if len(indices) == len(emails):
                break

    for k, v in indices.items():
        result[v] = emails[k]

    return result
Example #49
0
def get_categories(part_info: dict, supplier_only=False) -> list:
    ''' Find categories from part supplier data, use "somewhat automatic" matching.

    Resolution order:
    1. Exact match of the supplier subcategory against the inversed category map.
    2. Optional "function filter": pick a subcategory by fuzzy-matching part
       parameter values (e.g. 'Function Type') against subcategory names.
    3. Fallback fuzzy match of supplier category/subcategory names against the
       InvenTree category map.

    :param part_info: supplier part data; must contain 'category' and
        'subcategory' keys ('parameters' is also read when function
        filtering applies)
    :param supplier_only: if True, return the supplier's own category strings
        without any matching
    :return: two-element list [category, subcategory]; an entry is None when
        no match was found
    '''
    categories = [None, None]

    # Missing supplier data: nothing to match against.
    try:
        supplier_category = str(part_info['category'])
        supplier_subcategory = str(part_info['subcategory'])
    except KeyError:
        return categories

    # Return supplier category, if match not needed
    if supplier_only:
        categories[0] = supplier_category
        categories[1] = supplier_subcategory
        return categories

    function_filter = False
    # TODO: Make 'filter_parameter' user defined?
    filter_parameter = 'Function Type'

    ### Check existing matches
    # Load inversed category map (supplier subcategory -> InvenTree category)
    category_map = config_interface.load_supplier_categories_inversed(
        supplier_config_path=settings.CONFIG_DIGIKEY_CATEGORIES)

    try:
        for inventree_category in category_map.keys():
            for key, inventree_subcategory in category_map[
                    inventree_category].items():
                if supplier_subcategory == key:
                    categories[0] = inventree_category
                    # Check if filtering by function
                    if inventree_subcategory.startswith(
                            config_interface.FUNCTION_FILTER_KEY):
                        function_filter = True

                    # Save subcategory if not function filtered
                    if not function_filter:
                        categories[1] = inventree_subcategory

                    break
    except:
        # NOTE(review): bare except deliberately tolerates a malformed or
        # missing category map; matching falls through to the stages below.
        pass

    ### Function Filter
    # Resolve the subcategory by comparing the part's filter-parameter
    # values against subcategory names.
    if not categories[1] and function_filter:
        cprint(
            f'[INFO]\tSubcategory is filtered using "{filter_parameter}" parameter',
            silent=settings.SILENT,
            end='')
        # Load parameter map (supplier parameter name -> InvenTree parameter)
        parameter_map = config_interface.load_category_parameters(
            categories[0], settings.CONFIG_DIGIKEY_PARAMETERS)
        # Build compare list: values of the part's filter parameter(s)
        compare = []
        for supplier_parameter, inventree_parameter in parameter_map.items():
            if (supplier_parameter in part_info['parameters'].keys() and \
            inventree_parameter == filter_parameter):
                compare.append(part_info['parameters'][supplier_parameter])

        # Load subcategory map for the already-matched category
        category_map = config_interface.load_supplier_categories(
            supplier_config_path=settings.CONFIG_DIGIKEY_CATEGORIES)[
                categories[0]]
        for inventree_subcategory in category_map.keys():
            for item in compare:
                fuzzy_match = fuzz.partial_ratio(inventree_subcategory, item)
                display_result = f'"{inventree_subcategory}" ?= "{item}"'.ljust(
                    50)
                cprint(f'{display_result} => {fuzzy_match}',
                       silent=settings.HIDE_DEBUG)
                if fuzzy_match >= settings.CATEGORY_MATCH_RATIO_LIMIT:
                    # Strip the filter-key prefix from the stored name
                    categories[1] = inventree_subcategory.replace(
                        config_interface.FUNCTION_FILTER_KEY, '')
                    break

            if categories[1]:
                cprint(f'\t[ PASS ]', silent=settings.SILENT)
                break

    if not categories[1] and function_filter:
        cprint(f'\t[ FAILED ]', silent=settings.SILENT)

    ### Automatic Match
    # Fuzzy-match the supplier names directly against the category map.
    if not (categories[0] and categories[1]):
        # Load category map
        category_map = config_interface.load_supplier_categories(
            supplier_config_path=settings.CONFIG_DIGIKEY_CATEGORIES)

        def find_supplier_category_match(supplier_category: str):
            # Check for match with Inventree categories
            category_match = None
            subcategory_match = None

            for inventree_category in category_map.keys():
                fuzzy_match = fuzz.partial_ratio(supplier_category,
                                                 inventree_category)
                display_result = f'"{supplier_category}" ?= "{inventree_category}"'.ljust(
                    50)
                cprint(f'{display_result} => {fuzzy_match}',
                       silent=settings.HIDE_DEBUG)

                # Category name itself did not match: try its subcategories.
                # NOTE(review): if a subcategory matches, fuzzy_match is
                # updated and the check below also accepts the category.
                if  fuzzy_match < settings.CATEGORY_MATCH_RATIO_LIMIT and \
                 category_map[inventree_category]:
                    # Compare to subcategories
                    for inventree_subcategory in category_map[
                            inventree_category]:
                        fuzzy_match = fuzz.partial_ratio(
                            supplier_category, inventree_subcategory)
                        display_result = f'"{supplier_category}" ?= "{inventree_subcategory}"'.ljust(
                            50)
                        cprint(f'{display_result} => {fuzzy_match}',
                               silent=settings.HIDE_DEBUG)

                        if fuzzy_match >= settings.CATEGORY_MATCH_RATIO_LIMIT:
                            subcategory_match = inventree_subcategory
                            break

                if fuzzy_match >= settings.CATEGORY_MATCH_RATIO_LIMIT:
                    category_match = inventree_category
                    break

            return category_match, subcategory_match

        # Find category and subcategories match
        category, subcategory = find_supplier_category_match(supplier_category)
        if category:
            categories[0] = category
        if subcategory:
            categories[1] = subcategory

        # Run match with supplier subcategory
        if not categories[0] or not categories[1]:
            category, subcategory = find_supplier_category_match(
                supplier_subcategory)

        if category and not categories[0]:
            categories[0] = category
        if subcategory and not categories[1]:
            categories[1] = subcategory

    # Final checks
    if not categories[0]:
        cprint(
            f'[INFO]\tWarning: "{part_info["category"]}" did not match any supplier category ',
            silent=settings.SILENT)
    else:
        cprint(f'[INFO]\tCategory: "{categories[0]}"', silent=settings.SILENT)
    if not categories[1]:
        cprint(
            f'[INFO]\tWarning: "{part_info["subcategory"]}" did not match any supplier subcategory ',
            silent=settings.SILENT)
    else:
        cprint(f'[INFO]\tSubcategory: "{categories[1]}"',
               silent=settings.SILENT)

    return categories
Example #50
0
def hosts(year):
    """Guess the award-show host(s) for a given year from tweets.

    Reads 'pruned_tweets_<year>.json' (one JSON object per line with a
    'text' field), keeps tweets containing "hosted", tallies person names
    produced by the NER `tagger`, merges fuzzy-duplicate names, and
    returns the top one or two names.

    :param year: year of the show (int or str)
    :return: list with one name when the leader clearly dominates
        (more than 4x the runner-up's count), otherwise the top two
    """
    year = str(year)
    file_name = 'pruned_tweets_' + year + '.json'
    tweets = []
    with open(file_name, encoding="utf8") as infile:
        for line in infile:
            tweets.append(json.loads(line)['text'].lower())
    # (no explicit close needed; the with-block already closed the file)

    # Keep only tweets that mention hosting.
    relevant_tweets = [tweet for tweet in tweets if "hosted" in tweet]

    # Tally person names found by the tagger: list of [name, count].
    counts = []
    for tweet in relevant_tweets:
        for person in tagger(tweet):
            # Strip possessive suffixes ("Tina's" -> "Tina").
            if "'" in person:
                person = person[:person.index("'")]
            if "\u2019" in person:
                person = person[:person.index("\u2019")]

            # Discard URLs and over-long "names".
            if "http" in person or len(person.split()) >= 3:
                continue

            # Credit every existing entry this name is a substring of;
            # only add a new entry when nothing matched.
            matched = False
            for entry in counts:
                if person in entry[0]:
                    entry[1] += 1
                    matched = True
            if not matched:
                counts.append([person, 1])

    # Keep the ten most-mentioned candidates.
    counts.sort(key=lambda x: -x[1])
    counts = counts[:10]

    # Merge fuzzy-duplicate names bottom-up, folding each low-ranked
    # entry into the first similar higher-ranked one.
    merged = []
    for curr in range(len(counts) - 1, -1, -1):
        name, score = counts[curr]
        found = False
        for chk in range(curr - 1, -1, -1):
            name2 = counts[chk][0]
            # Similarity threshold combining partial and full ratios.
            similarity = max(fuzz.partial_ratio(name, name2),
                             fuzz.ratio(name, name2))
            if similarity > 85:
                found = True
                # Prefer a multi-word (fuller) name as representative.
                if len(name2.split()) < 2:
                    counts[chk][0] = name
                counts[chk][1] += score
                break
        if not found:
            merged.append([name, score])

    merged.sort(key=lambda x: -x[1])

    # Fix: guard against a single surviving candidate (originally raised
    # IndexError on merged[1]).
    if len(merged) == 1 or merged[0][1] > 4 * merged[1][1]:
        return [merged[0][0].title()]
    return [merged[0][0].title(), merged[1][0].title()]
Example #51
0
    def fitness(self, txt, qst):
        """Return True if *txt* plausibly contains the answer to *qst*.

        Classifies the question, generates simplified candidate sentences,
        drops tree fragments to collect candidate answers (side effects on
        self.candidateSentence / self.candidateAnswer), and compares the
        best candidate's fuzzy score against self.threshold.

        :param txt: candidate source sentence
        :param qst: question text
        :return: bool — best candidate score (or binary-answer similarity)
            exceeds self.threshold
        """
        self.qstType(qst)
        if self.thisType == 'UK':
            # Binary (yes/no) question: decide by sentence similarity alone.
            _, sim = self.bin_answer(qst, txt)
            return sim > self.threshold
        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []

        # Expand the input into the original sentence plus simplified,
        # conjunction-split variants.
        extend_list = []
        for this_sent in [txt]:
            extend_list.append(this_sent)
            parse_tree = self.qgPipeline.getParseTree(this_sent)
            no_conj_list = self.qgPipeline.splitConj(parse_tree)
            for simplified in self.qgPipeline.simplify_sentence(no_conj_list):
                extend_list.append(simplified)

        # Repeatedly drop fragments from each parse tree; dropFragment
        # populates candidateSentence/candidateAnswer as side effects.
        for sentence in extend_list:
            for tree in self.sNLP.parser_sents([sentence]):
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    now_tree = copy.deepcopy(tree)
                    self.dropTime = 0
                    now_tree = self.dropFragment(now_tree, qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1

        # Fix: check qstSim before scoring. The original guard sat *after*
        # fuzz.partial_ratio(self.qstSim, ...), which would raise on None
        # before the guard could run.
        if self.qstSim is None:
            return False

        best_dis = 0
        best_ans = '_'

        for i in range(len(self.candidateSentence)):
            now_sentence = ' '.join(self.candidateSentence[i])
            score = fuzz.partial_ratio(self.qstSim, now_sentence)
            this_ans = ' '.join(self.candidateAnswer[i])
            if score >= best_dis:
                # Tie-break by answer length depending on question type:
                # prefer shorter answers for WHADVP/WHPP, longer for WHNP.
                if score == best_dis and len(this_ans) >= len(
                        best_ans) and self.thisType in ['WHADVP', 'WHPP']:
                    continue
                if score == best_dis and len(this_ans) <= len(
                        best_ans) and self.thisType in ['WHNP']:
                    continue
                best_dis = score
                best_ans = this_ans

        return self.threshold < best_dis
def run_topical_analysis(string):
    '''Searches for a user given string and performs sentiment analysis for it.

    Fuzzy-matches *string* against post titles, flairs and selftexts in
    out/dataset.csv; runs VADER sentiment on matching posts and on the
    comments (out/dataset_comments.csv) attached to them; appends
    positive/negative texts to per-topic output files; finally plots the
    counts via plot_word_types.

    :param string: topic to search for
    '''
    sia = SIA()

    print("Searching for " + string + "...")

    # Output paths; "%r" reproduces the original quoted-topic file naming.
    positive_path = r"out/positive_list_" + "%r" % string + r".txt"
    negative_path = r"out/negative_list_" + "%r" % string + r".txt"

    def _append_line(path, text):
        # Opened lazily in append mode so a file is only created when a
        # matching row actually exists (matches original behavior).
        with open(path, "a", encoding="utf-8") as outfile:
            outfile.write(text + "\n")

    with open(r"out/dataset.csv", "r") as infile_posts:
        with open(r"out/dataset_comments.csv", "r") as infile_comments:
            post_reader = csv.reader(infile_posts)
            comment_reader = csv.reader(infile_comments)

            # Post ids whose comments should be analyzed (set: O(1) lookup
            # instead of the original list scan per comment row).
            include_ids = set()
            positive_count = 0
            negative_count = 0
            total_count = 0

            for row in post_reader:
                print("Analyzing posts and comment rows: " +
                      str(total_count + 1),
                      end="\r")

                match_title = fuzz.partial_ratio(string, row[0])
                match_flair = fuzz.partial_ratio(string, row[4])
                if row[6] == "''":
                    # Post has no selftext: match on title/flair only.
                    if not (match_flair >= 85 or match_title >= 85):
                        continue
                    title_scores = sia.polarity_scores(row[0])
                    flair_scores = None
                else:
                    match_selftext = fuzz.partial_ratio(string, row[6])
                    if not (match_flair >= 85 or match_title >= 85
                            or match_selftext >= 85):
                        continue
                    title_scores = sia.polarity_scores(row[0])
                    # NOTE(review): scores the flair (row[4]) but writes the
                    # title (row[0]) below — possibly meant to score/write
                    # the selftext; behavior preserved as-is.
                    flair_scores = sia.polarity_scores(row[4])

                include_ids.add(row[7])

                if title_scores['compound'] > 0.2:
                    _append_line(positive_path, row[0])
                    positive_count += 1
                    total_count += 1
                elif title_scores['compound'] < -0.2:
                    _append_line(negative_path, row[0])
                    negative_count += 1
                    total_count += 1

                if flair_scores is not None:
                    if flair_scores['compound'] > 0.2:
                        _append_line(positive_path, row[0])
                        positive_count += 1
                    elif flair_scores['compound'] < -0.2:
                        _append_line(negative_path, row[0])
                        negative_count += 1

            for row in comment_reader:
                print("Analyzing posts and comment rows: " +
                      str(total_count + 1),
                      end="\r")

                # Only comments belonging to a matched post.
                if row[1] not in include_ids:
                    continue
                total_count += 1

                comment_scores = sia.polarity_scores(row[0])

                if comment_scores['compound'] > 0.2:
                    _append_line(positive_path, row[0])
                    positive_count += 1
                elif comment_scores['compound'] < -0.2:
                    _append_line(negative_path, row[0])
                    negative_count += 1

            print("\nDone.")

            plot_word_types(total_count, negative_count, positive_count,
                            string)
Example #53
0
    def answer(self, txtList, qst):
        """Print the best answer to question *qst* found in *txtList*.

        Binary ('UK'-typed) questions are answered Yes/No via bin_answer;
        otherwise candidate sentences are generated by simplification and
        tree-fragment dropping, scored by fuzzy similarity against
        self.qstSim, and penalized when the answer's NER tags do not fit
        the question word (who/when/where).

        :param txtList: iterable of source sentences to search
        :param qst: question text
        :return: None — the answer is printed, not returned
        """
        # First question token (e.g. 'who', 'when', 'where') drives the
        # NER-based scoring adjustments below.
        self.head = word_tokenize(qst)[0].lower()

        self.qstType(qst)
        if self.thisType == 'UK':

            # Binary question: take the Yes/No answer from the most
            # similar sentence.
            best_score = 0
            best_ans = 'Yes'
            best_sent = '_'
            for txt in txtList:
                ans, sim = self.bin_answer(qst, txt)
                if sim > best_score:
                    best_ans = ans
                    best_score = sim
                    best_sent = txt
            #print('=======')
            #print(best_sent)
            #print(qst)
            print(best_ans + '.')
            #print(best_score)
            #print('=======')
            return

        qstType = self.thisType
        self.candidateAnswer = []
        self.candidateSentence = []

        extendList = []

        # Expand each source sentence into simplified, conjunction-split
        # variants; skip sentences that are too short or too long to parse
        # usefully.
        for thisSent in txtList:
            thisSent = self.preProcessText(thisSent)
            if (len(word_tokenize(thisSent)) < 4
                    or len(word_tokenize(thisSent)) > 25):
                continue

            extendList.append(thisSent)
            thisParseTree = self.qgPipeline.getParseTree(thisSent)

            no_conj_list = self.qgPipeline.splitConj(thisParseTree)
            simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)

            for i in simpl_sents:
                extendList.append(i)
        # pdb.set_trace()

        # Repeatedly drop fragments from each parse tree; dropFragment
        # populates candidateSentence/candidateAnswer as side effects.
        for txt in extendList:
            # print(txt)
            tree = self.sNLP.parser_sents([
                txt,
            ])
            for i in tree:
                self.dropTotal = 0
                self.dropFlag = 1
                while self.dropFlag:
                    self.findFlag = 0
                    nowTree = copy.deepcopy(i)
                    self.dropTime = 0
                    nowTree = self.dropFragment(nowTree, qstType)
                    if self.dropTime <= self.dropTotal:
                        self.dropFlag = 0
                    self.dropTotal += 1

        best_dis = 0
        best_candi = None
        best_sen = None
        best_ans = '_'

        # Score every candidate; keep the highest-scoring answer with
        # question-type-dependent tie-breaking and NER-based penalties.
        for i in range(len(self.candidateSentence)):
            nowSentence = ' '.join(self.candidateSentence[i])
            # print(nowSentence)
            # print(self.qstSim)
            score = fuzz.partial_ratio(self.qstSim, nowSentence)
            # print(score)
            # print('----------')

            this_ans = ' '.join(self.candidateAnswer[i])
            # print(this_ans, best_ans, score, best_dis)
            if self.qstSim == None: continue
            if this_ans == None: continue
            if (score >= best_dis):
                # On ties, prefer shorter answers for WHADVP/WHPP and
                # longer ones for WHNP.
                if score == best_dis and len(this_ans) >= len(
                        best_ans) and self.thisType in ['WHADVP', 'WHPP']:
                    continue
                if score == best_dis and len(this_ans) <= len(
                        best_ans) and self.thisType in ['WHNP']:
                    continue
                # 'who' answers should contain a person/organization;
                # otherwise apply a 10-point penalty (skip if that would
                # drop below the current best).
                if self.head == 'who':
                    ners = getExhaustiveNERs(this_ans)
                    #print(this_ans, ners[0])
                    if 'PERSON' not in ners[0] and 'ORGANIZATION' not in ners[
                            0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                # 'when' answers should contain a date.
                if self.head == 'when':
                    ners = getExhaustiveNERs(this_ans)
                    if 'DATE' not in ners[0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                # 'where' answers should contain some location-like entity.
                if self.head == 'where':
                    ners = getExhaustiveNERs(this_ans)
                    if 'LOCATION' not in ners[0] and 'CITY' not in ners[
                            0] and 'ORGANIZATION' not in ners[
                                0] and 'STATE_OR_PROVINCE' not in ners[
                                    0] and 'COUNTRY' not in ners[0]:
                        if score - best_dis < 10:
                            continue
                        else:
                            score = score - 10
                best_dis = score

                best_sen = nowSentence
                best_ans = this_ans

        #print('++++++++++++++++++')
        #print(qst)
        #print(best_dis)
        #print(best_sen)
        if best_ans == '_':
            print('I cannot answer that question: ' + qst)
        else:
            print(best_ans.capitalize() + '.')
Example #54
0
    def bin_answer(self, question, sent):
        """Answer a yes/no question against a single candidate sentence.

        First compares dependency-parse subjects: a shared subject yields an
        immediate answer with similarity 100 ('No' if the question is
        negated, else 'Yes'). Otherwise falls back to token-level negation
        heuristics plus fuzzy similarity.

        :param question: question text
        :param sent: candidate sentence
        :return: tuple (answer, similarity) — answer is 'Yes' or 'No',
            similarity is an int 0-100
        """
        #print(question, sent)

        # Dependency-parse both texts into (head, relation, dependent)
        # triples.
        qstTree = self.sNLP.dependency_parse(question)
        qstTree = qstTree.__next__()
        qstTree = list(qstTree.triples())
        sentTree = self.sNLP.dependency_parse(sent)
        sentTree = sentTree.__next__()
        sentTree = list(sentTree.triples())
        #print(qstTree, sentTree)
        qstSub = []
        sentSub = []
        flag = False  # sentence shares a subject with the question
        neg = False   # question contains a negation relation
        for x in qstTree:
            # print(x)
            if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
                qstSub.append(self.parseDep(x))
            if x[1] == 'neg':
                neg = True
        for x in sentTree:
            if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
                sentSub.append(self.parseDep(x))
                if self.parseDep(x) in qstSub:
                    flag = True
        #print(qstSub)
        #print(sentSub)

        # Shared subject: answer directly with full confidence.
        if flag:
            if neg:
                return ('No', 100)
            else:
                return ('Yes', 100)

        # NOTE(review): bin_tags is built but never used below — presumably
        # leftover from an earlier heuristic; confirm before removing.
        bin_tags = set(
            ["did", 'do', 'does', 'are', 'is', 'have', 'was', 'were', 'has'])
        question = question.lower()
        sent = sent.lower()
        q_tokens = word_tokenize(question)
        s_tokens = word_tokenize(sent)
        negations = set(['not', 'never', "aren't"])
        ans = ''
        # case 1: negations — toggle the answer per negation-word mismatch
        # between question and sentence
        for neg in negations:
            if (neg in q_tokens) and (neg not in s_tokens):
                if ans == "No":
                    ans = "Yes"
                else:
                    ans = "No"
            if (neg in q_tokens) and (neg in s_tokens):
                if ans == "Yes":
                    ans = "No"
                else:
                    ans = "Yes"
        # case 2: similarity
        # NOTE(review): this unconditionally overwrites the answer from
        # case 1; only the similarity value ultimately decides Yes/No.
        # Confirm whether case 1 was meant to take precedence.
        sim = fuzz.partial_ratio(question, sent)
        if sim > 90:
            ans = "Yes"
        else:
            ans = "No"
        return (ans, sim)
Example #55
0
        max_fuzz_token_sort_ratio = 0.0

        for parag_sent in segmenter.split(paragraph):
            parag_stems = stemmize(parag_sent)

            # chars_dist = edit_distance( normalize_word2(quest), normalize_word2(parag), substitution_cost=1, transpositions=True)
            # min_chars_edit_dist = min( min_chars_edit_dist, chars_dist )
            shingles3 = distance.jaccard( quest_shingles3, get_shingles3(parag_stems))

            fuzz_qratio = 0.01 * fuzz.QRatio(quest_stems, parag_stems)
            max_fuzz_qratio = max( max_fuzz_qratio, fuzz_qratio )

            fuzz_WRatio = 0.01 * fuzz.WRatio(quest_stems, parag_stems)
            max_fuzz_WRatio = max( max_fuzz_WRatio, fuzz_WRatio )

            fuzz_partial_ratio = 0.01 * fuzz.partial_ratio(quest_stems, parag_stems)
            max_fuzz_partial_ratio = max( max_fuzz_partial_ratio, fuzz_partial_ratio )

            fuzz_partial_token_set_ratio = 0.01 * fuzz.partial_token_set_ratio(quest_stems, parag_stems)
            max_fuzz_partial_token_set_ratio = max( max_fuzz_partial_token_set_ratio, fuzz_partial_token_set_ratio)

            fuzz_partial_token_sort_ratio = 0.01 * fuzz.partial_token_sort_ratio(quest_stems, parag_stems)
            max_fuzz_partial_token_sort_ratio = max( max_fuzz_partial_token_sort_ratio, fuzz_partial_token_sort_ratio )

            fuzz_token_set_ratio = 0.01 * fuzz.token_set_ratio(quest_stems, parag_stems)
            max_fuzz_token_set_ratio = max( max_fuzz_token_set_ratio, fuzz_token_set_ratio)

            fuzz_token_sort_ratio = 0.01 * fuzz.token_sort_ratio(quest_stems, parag_stems)
            max_fuzz_token_sort_ratio = max( max_fuzz_token_sort_ratio, fuzz_token_sort_ratio )

        df.loc[index, 'max_shingles3_str'] = max_shingles3
Example #56
0
def main():
    """Entry point to the script

    This function does the following:
    a. calls the file_download function to download the files from a web location
    b. generates file download success report
    c. checks for plagiarism
    d. generates plagiarism check report

    Relies on module-level names defined elsewhere in this file:
    test_takers, rep_dir, answer_folder, web_url, tasks_folders,
    hash_check, type_of_check, file_download, file_fldr_exists,
    filenames_from_html, move_file, retrieve_folder_content,
    easygui, tqdm, codecs, hashlib, fuzz, os.
    """
    import datetime

    try:
        # List of test-taker identifiers; test_takers() is defined elsewhere.
        test_takers_list = test_takers()
        # Encodings tried in order when reading answer files.
        # NOTE(review): "utf-8" and "utf8" are the same codec, so the last
        # entry is redundant — confirm before removing.
        types_of_encoding = ["utf-8", "cp1252", "cp850", "utf8"]

        # creating required directories
        if not os.path.exists(rep_dir):
            os.mkdir(rep_dir)

        if not os.path.exists(answer_folder):
            os.mkdir(answer_folder)

        # ":" is not a legal filename character on Windows, so replace it
        # in the timestamp used for folder/file names.
        date_time_stamp_raw = str(datetime.datetime.now())
        date_time_stamp = date_time_stamp_raw.replace(":", ".")

        ans_folder_name = os.path.join(answer_folder,
                                       "Answers_" + date_time_stamp)
        if not os.path.exists(ans_folder_name):
            os.mkdir(ans_folder_name)

        rep_folder_name = os.path.join(rep_dir, "Report_" + date_time_stamp)
        if not os.path.exists(rep_folder_name):
            os.mkdir(rep_folder_name)

        # downloading answers and working on success report file
        # NOTE(review): rep_folder_name already contains rep_dir, so joining
        # rep_dir again is harmless only because os.path.join keeps the
        # later absolute-ish component — confirm the intended path.
        with open(
                os.path.join(rep_dir, rep_folder_name,
                             "report " + date_time_stamp + ".csv"),
                "w") as report_file:
            # Fixed-width, pipe-separated header for the download report.
            report_file.write("_" * 75 + "\n")
            report_file.write(" Test Taker " + "| Tasks " +
                              "  |                 Status                |" +
                              "  File Name  " + "\n")
            report_file.write("-" * 75 + "\n")

            for test_taker in tqdm(test_takers_list):
                for tasks_folder in tasks_folders:
                    # NOTE(review): ids containing "." skip the "~" home-dir
                    # prefix in the URL — confirm this matches the web
                    # server's user-folder layout.
                    if "." in test_taker:
                        usr_folder = web_url + test_taker
                        folder_url = web_url + test_taker + "/" + tasks_folder
                    else:
                        usr_folder = web_url + "~" + test_taker
                        folder_url = web_url + "~" + test_taker + "/" + tasks_folder
                    if file_fldr_exists(usr_folder):
                        # file_download appears to return 0 on failure and a
                        # local name on success — TODO confirm its contract.
                        folder_name = file_download(folder_url)

                        if folder_name != 0:
                            # The downloaded directory listing is an HTML
                            # page; rename it so it can be parsed, then
                            # delete it once the file names are extracted.
                            os.rename(tasks_folder, tasks_folder + ".html")
                            existing_files = filenames_from_html(
                                os.path.join(os.getcwd(),
                                             tasks_folder + ".html"))
                            os.remove(
                                os.path.join(os.getcwd(),
                                             tasks_folder + ".html"))
                            if len(existing_files) > 0:
                                for file_name in existing_files:
                                    file_url = folder_url + "/" + file_name
                                    # file_name is rebound to the local name
                                    # returned by file_download.
                                    file_name = file_download(file_url)

                                    # Ensure per-user and per-task
                                    # destination folders exist.
                                    dest_usr = os.path.join(
                                        ans_folder_name, test_taker)
                                    if not os.path.exists(dest_usr):
                                        os.mkdir(dest_usr)

                                    dest_task = os.path.join(
                                        ans_folder_name, test_taker,
                                        tasks_folder)
                                    if not os.path.exists(dest_task):
                                        os.mkdir(dest_task)

                                    try:
                                        move_file(file_name, test_taker,
                                                  tasks_folder,
                                                  ans_folder_name)
                                        # Report specific data
                                        success_text_report = "  " + test_taker + "  | " + tasks_folder + " |      Files successfully downloaded      | " + file_name + "\n"
                                        report_file.write(success_text_report)
                                    except Exception:
                                        # NOTE(review): any move failure is
                                        # reported as "too big to download";
                                        # the real cause is swallowed.
                                        success_text_report = "  " + str(
                                            test_taker
                                        ) + "  | " + str(
                                            tasks_folder
                                        ) + " |        Files too big to download        | " + str(
                                            file_name) + "\n"
                                        report_file.write(success_text_report)
                                        pass

                            else:
                                # Report specific data
                                no_files_found = "  " + test_taker + "  | " + tasks_folder + " |No files found in the folder to download |" + "\n"
                                report_file.write(no_files_found)

                        else:
                            # Report specific data
                            folder_not_found = "  " + test_taker + "  | " + tasks_folder + " |      Folder named " + tasks_folder + " not found       |" + "\n"
                            report_file.write(folder_not_found)

                    else:
                        # Report specific data
                        error_dwnld_file = "  " + test_taker + "  | " + tasks_folder + " |Can't access url or user folder not found|" + "\n"
                        report_file.write(error_dwnld_file)

                report_file.write(" " + "." * 75 + "\n")
            report_file.write(" " + "-" * 75 + "\n")
            # NOTE(review): redundant — the enclosing `with` already closes
            # the file on exit.
            report_file.close()

        if easygui.ynbox(
                "Done downloading files and creating report. \n\nDo you want to run the plagiarism check now?",
                "Run plagiarism check?",
                choices=("[<F1>]Yes", "[<F2>]No"),
                default_choice="[<F1>]Yes",
                cancel_choice="[<F2>]No"):

            # Branch 1: fuzzy-compare the raw text of every pair of answer
            # files for the same task across different students.
            if not hash_check:
                # combs records both orderings of a (task, studentA,
                # studentB) pair so each pair is compared only once;
                # final_results keeps one canonical entry per pair.
                combs = {}
                final_results = {}
                for Answer_folder in tqdm(
                        retrieve_folder_content(answer_folder)):
                    for Student_folder in retrieve_folder_content(
                            Answer_folder):
                        for Task_folder in retrieve_folder_content(
                                Student_folder):
                            for Ans_file in retrieve_folder_content(
                                    Task_folder, True):
                                # Student_folder2 is the student folder to compare the Ans_file content with
                                for Student_folder2 in retrieve_folder_content(
                                        Answer_folder):
                                    if Student_folder2 != Student_folder:
                                        # Task_folder2 is the Task folder inside the Student_folder2 to compare the Ans_file with
                                        for Task_folder2 in retrieve_folder_content(
                                                Student_folder2):
                                            if os.path.basename(
                                                    Task_folder2
                                            ) == os.path.basename(Task_folder):
                                                stu_fol_1 = os.path.basename(
                                                    Student_folder)
                                                stu_fol_2 = os.path.basename(
                                                    Student_folder2)
                                                # stu_tskfile_1 = os.path.basename(Ans_file)
                                                # temp_comb[0] is "A vs B",
                                                # temp_comb[1] the reverse.
                                                temp_comb = [
                                                    os.path.basename(
                                                        Task_folder) + "_" +
                                                    stu_fol_1 + "_" +
                                                    stu_fol_2,
                                                    os.path.basename(
                                                        Task_folder) + "_" +
                                                    stu_fol_2 + "_" + stu_fol_1
                                                ]
                                                if temp_comb[0] not in combs:
                                                    for Ans_file2 in retrieve_folder_content(
                                                            Task_folder2,
                                                            True):
                                                        # with open(Ans_file2, 'r') as fp2:
                                                        #     with open(Ans_file, 'r') as fp:
                                                        #         s = fp.read()
                                                        #         s_tocomp = fp2.read()
                                                        #         result = fuzz.ratio(s, s_tocomp)
                                                        #         combs.update(
                                                        #             {temp_comb[0]: result, temp_comb[1]: result})
                                                        #         final_results.update({temp_comb[0]: result})

                                                        # NOTE(review): the inner loop below shadows
                                                        # encoding_type from this outer loop; with
                                                        # errors='replace' the first encoding always
                                                        # succeeds, so effectively only one pass runs.
                                                        for encoding_type in types_of_encoding:
                                                            if temp_comb[
                                                                    0] not in combs:
                                                                with codecs.open(
                                                                        Ans_file,
                                                                        encoding
                                                                        =encoding_type,
                                                                        errors=
                                                                        'replace'
                                                                ) as fp:
                                                                    for encoding_type in types_of_encoding:
                                                                        if temp_comb[
                                                                                0] not in combs:
                                                                            with codecs.open(
                                                                                    Ans_file2,
                                                                                    encoding
                                                                                    =encoding_type,
                                                                                    errors
                                                                                    ='replace'
                                                                            ) as fp2:
                                                                                # with open(Ans_file, 'r') as fp:
                                                                                # with open(Ans_file2, 'r') as fp2:

                                                                                s = fp.read(
                                                                                )
                                                                                # print(s.encode('utf-8'))
                                                                                s_tocomp = fp2.read(
                                                                                )
                                                                                # print(s_tocomp.encode('utf-8'))
                                                                                # NOTE(review): `result` is unbound if
                                                                                # type_of_check matches none of these.
                                                                                if type_of_check == "Simple Ratio":
                                                                                    result = fuzz.ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )
                                                                                elif type_of_check == "Partial Ratio":
                                                                                    result = fuzz.partial_ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )
                                                                                elif type_of_check == "Token Sort Ratio":
                                                                                    result = fuzz.token_sort_ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )
                                                                                elif type_of_check == "Token Set Ratio":
                                                                                    result = fuzz.token_set_ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )

                                                                                combs.update({
                                                                                    temp_comb[0]:
                                                                                    result,
                                                                                    temp_comb[1]:
                                                                                    result
                                                                                })
                                                                                final_results.update({
                                                                                    temp_comb[0]:
                                                                                    result
                                                                                })
                #print(final_results)

            # Branch 2: same traversal, but compare MD5 digests of the file
            # contents instead of the raw text.
            # NOTE(review): running fuzz.* ratios on binary MD5 digests only
            # ever detects byte-identical files — confirm that is intended.
            else:
                combs = {}
                final_results = {}
                for Answer_folder in tqdm(
                        retrieve_folder_content(answer_folder)):
                    for Student_folder in retrieve_folder_content(
                            Answer_folder):
                        ## html_td = "<tr> <td> " + student_folder + "</td>"
                        for Task_folder in retrieve_folder_content(
                                Student_folder):
                            for Ans_file in retrieve_folder_content(
                                    Task_folder, True):
                                # Student_folder2 is the student folder to compare the Ans_file content with
                                for Student_folder2 in retrieve_folder_content(
                                        Answer_folder):
                                    if Student_folder2 != Student_folder:
                                        # Task_folder2 is the Task folder inside the Student_folder2 to compare the Ans_file with
                                        for Task_folder2 in retrieve_folder_content(
                                                Student_folder2):
                                            if os.path.basename(
                                                    Task_folder2
                                            ) == os.path.basename(Task_folder):
                                                stu_fol_1 = os.path.basename(
                                                    Student_folder)
                                                stu_fol_2 = os.path.basename(
                                                    Student_folder2)
                                                # stu_tskfile_1 = os.path.basename(Ans_file)
                                                temp_comb = [
                                                    os.path.basename(
                                                        Task_folder) + "_" +
                                                    stu_fol_1 + "_" +
                                                    stu_fol_2,
                                                    os.path.basename(
                                                        Task_folder) + "_" +
                                                    stu_fol_2 + "_" + stu_fol_1
                                                ]
                                                if temp_comb[0] not in combs:
                                                    for Ans_file2 in retrieve_folder_content(
                                                            Task_folder2,
                                                            True):

                                                        # Same shadowed-encoding_type pattern as the
                                                        # text-comparison branch above.
                                                        for encoding_type in types_of_encoding:
                                                            if temp_comb[
                                                                    0] not in combs:
                                                                with codecs.open(
                                                                        Ans_file,
                                                                        encoding
                                                                        =encoding_type,
                                                                        errors=
                                                                        'replace'
                                                                ) as fp:
                                                                    for encoding_type in types_of_encoding:
                                                                        if temp_comb[
                                                                                0] not in combs:
                                                                            with codecs.open(
                                                                                    Ans_file2,
                                                                                    encoding
                                                                                    =encoding_type,
                                                                                    errors
                                                                                    ='replace'
                                                                            ) as fp2:
                                                                                # with open(Ans_file, 'r') as fp:
                                                                                # with open(Ans_file2, 'r') as fp2:

                                                                                # MD5 digest of the first file's bytes.
                                                                                s_buf_raw = fp.read(
                                                                                )
                                                                                s_buf = s_buf_raw.encode(
                                                                                    'utf-8'
                                                                                )
                                                                                hasher = hashlib.md5(
                                                                                )
                                                                                hasher.update(
                                                                                    s_buf
                                                                                )
                                                                                s = hasher.digest(
                                                                                )

                                                                                # MD5 digest of the second file's bytes.
                                                                                s_tocomp_buf_raw = fp2.read(
                                                                                )
                                                                                s_tocomp_buf = s_tocomp_buf_raw.encode(
                                                                                    'utf-8'
                                                                                )
                                                                                hasher = hashlib.md5(
                                                                                )
                                                                                hasher.update(
                                                                                    s_tocomp_buf
                                                                                )
                                                                                s_tocomp = hasher.digest(
                                                                                )

                                                                                if type_of_check == "Simple Ratio":
                                                                                    result = fuzz.ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )
                                                                                elif type_of_check == "Partial Ratio":
                                                                                    result = fuzz.partial_ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )
                                                                                elif type_of_check == "Token Sort Ratio":
                                                                                    result = fuzz.token_sort_ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )
                                                                                elif type_of_check == "Token Set Ratio":
                                                                                    result = fuzz.token_set_ratio(
                                                                                        s,
                                                                                        s_tocomp
                                                                                    )

                                                                                combs.update({
                                                                                    temp_comb[0]:
                                                                                    result,
                                                                                    temp_comb[1]:
                                                                                    result
                                                                                })
                                                                                final_results.update({
                                                                                    temp_comb[0]:
                                                                                    result
                                                                                })
                                                                                # final_results.update({"comb": temp_comb[0], "similarity": result})
                                                                                ## html_td = html_td + </tr>
                #print(final_results)

            # Creating HTML file with plagiarism check results
            # %-templates for the report fragments.
            h_H2 = "<h2> %s </h2>"
            h_div = "<div> %s </div>"
            t_table = "<table> %s </table>"
            t_row = "<tr> %s </tr>"
            # NOTE(review): "bgcolcor" is a typo for "bgcolor" in the header
            # cells (runtime string; left unchanged here).
            t_header = "<th bgcolcor=\"#F5F1F1\"> %s </th>"
            t_data = "<td> %s </td>"
            t_data_red = "<td bgcolor=\"#FF6747\"> %s </td>"
            html_beg = """
            <html>
                <head>
                    <title>Plagiarism Results</title>
                        <style>
                            table {
                                font-family: arial, sans-serif;
                                border-collapse: collapse;
                                width: 100%;
                            }

                            td, th {
                                border: 1px solid #dddddd;
                                text-align: left;
                                padding: 8px;
                            }

                            h1 { color: #111; font-family: 'Helvetica Neue', sans-serif;
                            font-size: 80px; font-weight: bold; letter-spacing: -1px;
                            line-height: 1; text-align: center;
                            }

                            h2 { color: #111; font-family: 'Open Sans', sans-serif;
                            font-size: 30px; font-weight: bold; line-height: 32px;
                            margin: 0 0 10px; text-align: left;
                            }

                            tr:nth-child(even) {
                                background-color: #dddddd;
                            }
                        </style>
                </head>
                <body>
                <h1> Plagiarism Check Results </h1>
            """

            # NOTE(review): closing tag should be </html>, not <html>
            # (runtime string; left unchanged here).
            html_end = """
                </body>
            <html>

            """

            html_div = ""

            # Build one similarity table per task: `before_` are the column
            # students, `_after` the row students, as parsed from the
            # "task_studentA_studentB" result keys.
            for tasks_folder in tasks_folders:
                # task_spec_dict = {}
                before_ = []
                _after = []
                for key in final_results:
                    if tasks_folder in key:
                        # task_spec_dict.update({key: final_results[key]})
                        if key.split("_")[1] not in _after:
                            before_.append(key.split("_")[1])
                        if key.split("_")[2] not in before_:
                            _after.append(key.split("_")[2])

                #print(task_spec_dict)
                before_ = list(set(before_))
                _after = list(set(_after))

                heading = h_H2 % tasks_folder

                tables = ""
                table_rows = ""
                table_headers = ""

                table_headers += t_header % "Test Takers"

                for b in before_:
                    table_headers += t_header % b
                table_rows += t_row % table_headers
                table_headers = ""

                for a in _after:
                    new_data = t_data % a
                    for b in before_:
                        if a != b:
                            try:
                                sim_res = final_results[tasks_folder + "_" +
                                                        b + "_" + a]
                            except Exception as e:
                                # Missing pair → treat as 0% similarity.
                                sim_res = 0
                                #print(e)
                                pass
                            # Similarity above 80% is highlighted in red.
                            if sim_res > 80:
                                new_data += t_data_red % sim_res
                            else:
                                new_data += t_data % sim_res
                    table_rows += t_row % new_data
                # NOTE(review): `tables += tables + ...` doubles the
                # accumulated markup each iteration — likely meant
                # `tables += t_table % table_rows`.
                tables += tables + t_table % table_rows

                html_div += heading + h_div % tables + "<br>" + "<br>"

            whole_html = html_beg + html_div + html_end

            # Write the assembled report next to the script.
            f = open("plagiarism_check.html", "w+")
            f.write(whole_html)
            f.close()

            # # Generating csv from dictionary combs
            # final_results_json = json.dumps(final_results)
            # final_results = json.loads(final_results_json)
            #
            # f = csv.writer(open("plagiarism_check.csv", "wb+"))
            #
            # # Write CSV Header
            # f.writerow(["student1_student2_task#", "similarity"])
            #
            # for final_result in final_results:
            #     f.writerow([final_result.key(),
            #                 final_result.value()])

        else:
            pass
        easygui.msgbox(
            "Success! running the script \nCheck download report in Reports folder.",
            "Run Result")

    except Exception as err:
        # Surface any failure to the user, then exit with the error text.
        easygui.msgbox("Error!" + "\n" + str(err), "Run Result")
        raise SystemExit(str(err))
# Predicting a new result
# Evaluate the (previously trained) random-forest regressor on the test
# split; round_val presumably maps continuous predictions to class labels —
# TODO confirm against its definition elsewhere in the file.
y_pred_p1_rf = regressor.predict(X_test)
y_pred_p1_rf = round_val(y_pred_p1_rf)
        
cm_p1_rf = confusion_matrix(y_test, y_pred_p1_rf)
accuracy(cm_p1_rf) 
# Bare string below records the observed phase-1 accuracy (no-op at runtime).
"70%"


""""""""""""""" Phase 2: Feature Extraction """""""""""""""""""""""""""""

# NOTE(review): this aliases `data` rather than copying it, so the fw_*
# columns added below also appear on `data` (and are then duplicated by the
# pd.concat at the end) — `data.copy()` was probably intended; confirm.
data_p2 = data

# One fuzzywuzzy similarity feature per ratio variant, computed row-wise
# over the question1/question2 pair.
data_p2 ['fw_qratio'] = data_p2.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data_p2 ['fw_WRatio'] = data_p2.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data_p2 ['fw_par_ratio'] = data_p2.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2 ['fw_par_token_set_ratio'] = data_p2.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2 ['fw_par_token_sort_ratio'] = data_p2.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2 ['fw_token_set_ratio'] = data_p2.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2 ['fw_token_sort_ratio'] = data_p2.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

# Persist the extracted features for later phases.
data_p2.to_csv('quora_features_extraction_p2.csv', index=False)


# Drop identifier and raw-text columns, keeping only numeric features.
list(data_p2)
data_p2_upd = data_p2
data_p2_upd = data_p2_upd.drop(['id','question1','question2'], axis=1)
list(data_p2_upd)

# Combine phase-1 data with the phase-2 feature columns side by side.
data_p1_p2  = pd.concat([data.reset_index(drop=True), data_p2_upd], axis=1)
list(data_p1_p2)
Example #58
0
def linking_data_one_file(id2question, index_ent, index_names,
                          ent_resultpath):
    """Build entity-linking feature rows for one query-result file.

    For each query line, candidate entity mids are collected by looking up
    the query's n-grams (longest first) in the entity index; every
    candidate that has a name in ``index_names`` contributes one feature
    row (name-match flags, tf/idf, fuzzy similarity scores, true label).

    :param id2question: maps lineid -> (truth_mid, truth_name, truth_rel,
        question text)
    :param index_ent: inverted index mapping an n-gram string to candidate
        entity mids; lookup of a missing n-gram raises KeyError
    :param index_names: maps mid -> candidate entity name(s)
    :param ent_resultpath: path to the entity query-result file
    :return: pandas DataFrame with one row per (query, named candidate)
    """
    ent_lineids, id2queries = get_query_texts(
        ent_resultpath)  # ent_lineids may have some examples missing
    data = defaultdict(list)

    for i, lineid in enumerate(ent_lineids):
        # Skip lines that have no gold annotation.
        if lineid not in id2question:
            continue

        if i % 1000 == 0:
            print("line {}".format(i))

        truth_mid, truth_name, truth_rel, question = id2question[lineid]
        queries = id2queries[lineid]
        C = []  # candidate entities
        C_counts = []

        for query_text in queries:
            query_tokens = query_text.split()
            N = min(len(query_tokens), 3)

            # Try the longest n-grams first; stop at the first size that
            # yields any candidates.
            for n in range(N, 0, -1):
                ngrams_set = find_ngrams(query_tokens, n)
                for ngram_tuple in ngrams_set:
                    ngram = " ".join(ngram_tuple)
                    # unigram stopwords have too many candidates so just skip over
                    if ngram in stopwords:
                        continue
                    try:
                        cand_mids = index_ent[ngram]  # search entities
                    except KeyError:
                        # n-gram not present in the entity index;
                        # (was a bare except, which also swallowed
                        # KeyboardInterrupt/SystemExit)
                        continue
                    C.extend(cand_mids)
                if len(C) > 0:
                    break  # early termination

            # NOTE(review): C and C_counts deliberately(?) carry over
            # across queries of the same line, so later queries re-emit
            # rows for earlier candidates with updated counts — confirm
            # this accumulation is intended.
            for mid in set(C):
                count_mid = C.count(
                    mid)  # count number of times mid appeared in C
                C_counts.append((mid, count_mid))

            for mid, count_mid in C_counts:
                if mid in index_names:
                    cand_ent_name = pick_best_name(question, index_names[mid])
                    # Exact name match flag.
                    if cand_ent_name == truth_name:
                        data['exact_name_match'].append(1)
                    else:
                        data['exact_name_match'].append(0)

                    # if not exact match but close match
                    if cand_ent_name != truth_name and fuzz.ratio(
                            cand_ent_name, truth_name) >= 60:
                        data['partial_name_match'].append(1)
                    else:
                        data['partial_name_match'].append(0)

                    # Supervision signal: is this the gold entity?
                    if mid == truth_mid:
                        data['true_label'].append(1)
                    else:
                        data['true_label'].append(0)

                    data['lineid'].append(lineid)
                    data['query'].append(query_text)
                    data['length_name'].append(len(cand_ent_name.split()))
                    data['length_question'].append(len(question.split()))
                    data['length_query'].append(len(query_tokens))
                    data['tf'].append(count_mid)
                    data['idf'].append(
                        calc_idf(question, cand_ent_name, index_ent))
                    # Fuzzy similarity scores normalized to [0, 1].
                    data['sques'].append(
                        fuzz.ratio(cand_ent_name, question) / 100.0)
                    data['squer'].append(
                        fuzz.ratio(cand_ent_name, query_text) / 100.0)
                    data['pques'].append(
                        fuzz.partial_ratio(cand_ent_name, question) / 100.0)
                    data['pquer'].append(
                        fuzz.partial_ratio(cand_ent_name, query_text) / 100.0)

    df = pd.DataFrame(data)
    return df
Example #59
0
.intersection(set(str(x['question2'])
.lower().split()))), axis=1)
fs_1 = [
    'len_q1', 'len_q2', 'diff_len', 'len_char_q1',
    'len_char_q2', 'len_word_q1', 'len_word_q2',
    'common_words',
]
pprint(fs_1)
print('---- Computed ----')

print('---- Computing fuzzy features ----')
# One column per fuzzywuzzy scorer applied to the question pair; both
# cells are passed through str() before scoring.
for _name, _ratio in (
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio)):
    # Bind the scorer as a default argument to avoid late binding.
    data[_name] = data.apply(
        lambda x, r=_ratio: r(str(x['question1']), str(x['question2'])),
        axis=1)
Example #60
0
def fuzzy(i):
    """Case-insensitive partial fuzzy-match score for a string pair.

    :param i: sequence whose first two elements are the strings to compare
    :return: fuzz.partial_ratio score of the upper-cased strings
    """
    left, right = i[0], i[1]
    return fuzz.partial_ratio(left.upper(), right.upper())