Example #1
def fw_partial_token_set_ratio(question1, question2):
    fuzzy = []
    for q1, q2 in zip(question1, question2):
        # normalize fuzzywuzzy's 0-100 score to [0, 1]
        partial_ratio = fuzz.partial_token_set_ratio(str(q1), str(q2)) / 100
        fuzzy.append([partial_ratio])
    print("Created fuzz partial_token_set_ratio feature")
    return np.array(fuzzy)
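A quick usage sketch for the helper above (inputs are illustrative; assumes `from fuzzywuzzy import fuzz` and `import numpy as np`): dividing by 100 maps each score from fuzzywuzzy's 0-100 scale into [0, 1].

q1 = ["How do I learn Python?", "What is AI?"]
q2 = ["What is the best way to learn Python?", "What is artificial intelligence?"]
features = fw_partial_token_set_ratio(q1, q2)
print(features.shape)  # (2, 1): one normalized score per question pair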
Example #2
    def fit(self, X):
        d = defaultdict(list)
        for name in set(X):
            name = name.strip()
            name2 = re.sub(self.ex_lookup[self.stop_words], '', name).strip()
            if len(d) == 0:
                d[' '.join(name2.split()[:2])].append(name)
            else:
                score = 0
                group = 'string'
                for key in d:  # Python 3: iterate keys directly (iterkeys() is gone)
                    test = fuzz.partial_token_set_ratio(name2, key)
                    if test > score:
                        score, group = test, key
                if score > 70:
                    d[group].append(name)
                else:
                    d[' '.join(name2.split()[:2])].append(name)

        self.MAP = {}
        for k, v in d.items():  # Python 3: items() instead of iteritems()
            values = [(X.count(name), name) for name in v]
            most_common = max(values)  # (count, name) pair with the highest count
            for name in v:
                self.MAP[name] = most_common[1]
        return self
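For orientation, a minimal standalone sketch of the same grouping idea (the surrounding class is elided; the names and the 70 threshold are illustrative): each name joins the first bucket it fuzz-matches strongly enough, otherwise it opens a new bucket.

from collections import defaultdict
from fuzzywuzzy import fuzz

def group_names(names, threshold=70):
    groups = defaultdict(list)
    for name in names:
        best_score, best_key = 0, None
        for key in groups:
            score = fuzz.partial_token_set_ratio(name, key)
            if score > best_score:
                best_score, best_key = score, key
        if best_score > threshold:
            groups[best_key].append(name)
        else:
            groups[name].append(name)
    return dict(groups)

print(group_names(['Acme Corp', 'Acme Corporation', 'Widget Co']))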
Example #3
def extractFeatures(data,filename,column,bagOfWords):
    print('Extract features for',filename)
    data[column].fillna('NA', inplace=True)
    for word in bagOfWords:
        print(word)
        
        ratio = [
            fuzz.partial_token_set_ratio(re.sub(r'[\s"\\]', ' ', cmt).strip().upper(), word)
            for cmt in data[column]
        ]
        data['is' + word] = np.array([r > 70 for r in ratio]) * 1
    data.to_csv(filename, sep=',', encoding='utf-8')
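A hedged usage sketch (the DataFrame, column, filename, and bag of words are all illustrative): each word in the bag yields a binary is<WORD> column flagging rows whose cleaned, upper-cased text scores above 70.

import pandas as pd
df = pd.DataFrame({'comment': ['server crashed at boot', 'login page timeout']})
extractFeatures(df, 'features.csv', 'comment', ['CRASH', 'TIMEOUT'])
# adds binary columns isCRASH and isTIMEOUT, then writes features.csv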
Example #4
def extractFeatures(data, name):
    print("Extract features for", name)
    data["SUMMARYOPS"].fillna("NA", inplace=True)
    for word in bagOfWords:
        print(word)

        ratio = [
            fuzz.partial_token_set_ratio(re.sub(r'[\s"\\]', " ", cmt).strip().upper(), word)
            for cmt in data["SUMMARYOPS"]
        ]
        data["is" + word] = np.array([r > 70 for r in ratio]) * 1
    data.to_csv(name, sep=",", encoding="utf-8")
Example #5
 def curate(self):
     punc = set(string.punctuation)
     story_count = 0
     for source in self._sources:
         feed = feedparser.parse(source['url'])
         if 'entries' in feed:
             entries = feed['entries']
             for entry in entries:
                 story = {}
                 story['_source_id'] = source['id']
                 story['_source_name'] = source['name']
                 story['_source_url'] = source['url']
                 story['_path'] = self.generate_path()
                 if 'tags' in entry:
                     story['_tags'] = ''.join([tag['term'] for tag in entry['tags']])
                 else:
                     story['_tags'] = ''
                 media_content_url = ''
                 if 'media_content' in entry:
                     for media_content in entry['media_content']:
                         media_content_url = media_content['url']
                         break
                 if media_content_url == '':
                     if 'links' in entry:
                         for link in entry['links']:
                             if 'type' in link and 'href' in link and 'image' in link['type']:
                                 media_content_url = link['href']
                                 break
                 story['_media_content'] = media_content_url
                 if 'published_parsed' in entry:
                     story['_published_parsed'] = datetime.fromtimestamp(mktime(entry['published_parsed']))
                 else:
                     story['_published_parsed'] = datetime.utcnow()
                 if 'author' in entry:
                     story['_author'] = entry['author']
                 else:
                     story['_author'] = ''
                 if 'summary' in entry:
                     story['_summary'] = self.remove_markup(entry['summary'])
                     story['_fuzzy_summary'] = ''.join(sorted(
                         ['%s ' % w.upper() for w in ''.join([c for c in story['_summary'] if c not in punc]).split()
                          if len(w) > 3 and w.isalpha()]))
                 else:
                     story['_summary'] = ''
                     story['_fuzzy_summary'] = ''
                 if 'id' in entry:
                     story['_id'] = entry['id']
                 else:
                     story['_id'] = entry['link']
                 story.update(entry)
                 id = pdb.add_story(story)
                 if id is not None:
                     fuzzy_summaries = pdb.get_fuzzy_summaries(id)
                     for summary in fuzzy_summaries:
                         score = fuzz.partial_token_set_ratio(summary['fuzzy_summary'], story['_fuzzy_summary'])
                         if score >= 50:
                             topic_id = pdb.get_topic_story(summary['id'])
                             if topic_id is not None:
                                 pdb.add_topic_story(topic_id, id, score)
                             else:
                                 topic_id = pdb.add_topic(summary['s_title'])
                                 pdb.add_topic_story(topic_id, summary['id'], score)
                                 pdb.add_topic_story(topic_id, id, score)
                             break
                 story_count += 1
                 if not story_count % 50:
                     print('story_count: %d' % story_count)
     print('story_count: %d' % story_count)
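A standalone sketch of the fuzzy-summary trick used above (example texts are illustrative): keep alphabetic words longer than three characters, upper-case and sort them, then let partial_token_set_ratio flag stories on the same topic.

import string
from fuzzywuzzy import fuzz

punc = set(string.punctuation)

def fuzzy_summary(text):
    cleaned = ''.join(c for c in text if c not in punc)
    words = [w.upper() for w in cleaned.split() if len(w) > 3 and w.isalpha()]
    return ''.join(sorted('%s ' % w for w in words))

a = fuzzy_summary("Markets rally as rates fall")
b = fuzzy_summary("Rates fall and markets rally sharply")
print(fuzz.partial_token_set_ratio(a, b))  # a high score suggests the same topic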
Example #6
data = pd.read_csv('data/quora_duplicate_questions.tsv', sep='\t')
data = data.drop(['id', 'qid1', 'qid2'], axis=1)


data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = data.len_q1 - data.len_q2
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)


model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)


norm_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

question1_vectors = np.zeros((data.shape[0], 300))
error_count = 0
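The two apply calls above reference wmd and norm_wmd, which this snippet elides; a sketch of the usual definitions (assuming nltk's stopword list and the two gensim models loaded above; gensim's KeyedVectors.wmdistance computes Word Mover's Distance):

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def wmd(s1, s2):
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return model.wmdistance(s1, s2)

def norm_wmd(s1, s2):
    s1 = [w for w in str(s1).lower().split() if w not in stop_words]
    s2 = [w for w in str(s2).lower().split() if w not in stop_words]
    return norm_model.wmdistance(s1, s2)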
Example #7
 def fit(self, refdict=None, thresholds=(80, 80), parentdict=None):
     """Construct fuzzy-key lookup dict from all records (FieldUniqueCounter dict) as {raw: common}"""
     if parentdict is not None:
         self.parentdict = parentdict
     else:
         pd_tmp = [(pair[1], pair[0], re.sub(self.ex_lookup[self.stop_words], '', pair[0]))
                   for pair in refdict.items()]  # Python 3: items() instead of iteritems()
         pd_tmp = sorted(pd_tmp, reverse=True)
         # initial condenser, set primary keys
         for trip in pd_tmp:
             if trip[0] > 100:
                 self.parentdict[trip].append(trip)
             else:
                 score, group = 0, (0, 'a', 'a')
                 for key in self.parentdict:  # Python 3: iterate keys directly
                     test = fuzz.partial_token_set_ratio(trip[2], key[2])  # matcher
                     if test > score:
                         score, group = test, key
                 if score >= thresholds[0]:  # score threshold
                     self.parentdict[group].append(trip)
                 else:
                     self.parentdict[trip].append(trip)
     # score inter-key matches
     triplist = list(self.parentdict.keys())  # materialize for indexing under Python 3
     n = len(triplist)
     tripmatch = defaultdict(list)
     for i, trip in enumerate(triplist):
         score, match = 0, (0, 0, 0)
         for j in range(i + 1, n):  # Python 3: range() instead of xrange()
             trip2 = triplist[j]
             test = fuzz.partial_token_set_ratio(trip[2], trip2[2])  # matcher
             if test > score:
                 score, match = test, trip2
         if match != (0, 0, 0):
             tripmatch[trip] += [score, match]
     # print('tripmatch len = {}'.format(len(tripmatch)))
     # self.tripmatch = tripmatch
     # secondary condenser, aggregate primary key matches
     tickdict = defaultdict(list)
     ticker = 0
     key, match = tripmatch.popitem()
     tickdict[ticker].append(key)
     while len(tripmatch) > 0:  # NOTE: the original author suspected a bug in this loop
         if len(match) != 2:
             key, match = tripmatch.popitem()
         if match[0] >= thresholds[0]:
             tickdict[ticker].append(match[1])
         else:
             ticker += 1
             tickdict[ticker].append(match[1])
         newmatch = tripmatch[match[1]]
         del tripmatch[match[1]]
         match = newmatch
     # print('tickdict len = {}'.format(len(tickdict)))
     # self.tickdict = tickdict
     # set common key, gather names (as tuples)
     self.finaldict = defaultdict(list)
     for vlist in tickdict.values():  # Python 3: values() instead of itervalues()
         common = max(vlist)
         for trip in vlist:
             self.finaldict[common] += [x for x in self.parentdict[trip]]
     # construct MAP
     for k, v in self.finaldict.items():  # Python 3: items() instead of iteritems()
         for trip in v:
             self.MAP[trip[1]] = k[1]
     return self
Example #8
data['len_word_q2'] = data.question2.apply(
    lambda x: len(str(x).split()))  # word count of question 2
data['common_words'] = data.apply(lambda x: len(
    set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))),
                                  axis=1)  # words shared by both questions (via sets)

# use fuzz to extract features for the question pair
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])),
    axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])),
    axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)

model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
Example #9
        # Search in CPE
        #if any(w in k.lower() for w in terms):
        #    partial_results1_append((k, x, False))

        # There is an acronym?
        if any(acronym.search(x) is not None for acronym in acronyms):
            partial_results1_append((k, x, True))

    # Apply token_set_ratio
    partial_results2 = {}

    # k = CPE (str)
    # x = CPE description (str)
    # is_acronym = Bool
    for k, x, is_acronym in partial_results1:
        r = fuzz.partial_token_set_ratio(search_term, x, force_ascii=True)

        # Is false positive?
        if any(fil.search(x) is not None for fil in filters):
            continue

        # More weight if there is an acronym
        if is_acronym:
            r *= 1.25
            # Clamp the boosted score back to the 0-100 scale
            r = min(r, 100)

        partial_results2[k] = int(r)

        if results_number == 1 and r == 100:
            break
Example #10
def fuzz_partial_token_set_ratio(sentences):
    sen = sentences.split("\001")
    return fuzz.partial_token_set_ratio(sen[0], sen[1])
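One-line usage sketch: the two sentences travel in a single string, separated by the \001 control character.

print(fuzz_partial_token_set_ratio("How are you\001How old are you"))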
Example #11
df_data = pd.concat(
    [df_train[['question1', 'question2']], df_test[['question1', 'question2']]],
    axis=0)

df_feat['fuzz_qratio'] = df_data.apply(
    lambda row: fuzz.QRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_WRatio'] = df_data.apply(
    lambda row: fuzz.WRatio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_ratio'] = df_data.apply(
    lambda row: fuzz.partial_ratio(str(row['question1']), str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_set_ratio(str(row['question1']),
                                             str(row['question2'])),
    axis=1)
df_feat['fuzz_partial_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.partial_token_sort_ratio(str(row['question1']),
                                              str(row['question2'])),
    axis=1)
df_feat['fuzz_token_set_ratio'] = df_data.apply(
    lambda row: fuzz.token_set_ratio(str(row['question1']),
                                     str(row['question2'])),
    axis=1)
df_feat['fuzz_token_sort_ratio'] = df_data.apply(
    lambda row: fuzz.token_sort_ratio(str(row['question1']),
                                      str(row['question2'])),
    axis=1)

df_feat[:len_train].to_csv('train_feature_fuzz.csv', index=False)
Example #12
# install fuzzywuzzy

# 2nd set of features
from fuzzywuzzy import fuzz

data['qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['ratio'] = data.apply(
    lambda x: fuzz.ratio(str(x['question1']), str(x['question2'])), axis=1)
# NB: despite their names, the next two columns use the partial_* variants
data['token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)

data.head(3)

data.to_csv('C:\\Users\\Nitin PC\\Downloads\\preprocessed.tsv',
            sep='\t',
            encoding='utf-8')

data = pd.read_csv('C:\\Users\\Nitin PC\\Downloads\\preprocessed.tsv',
                   sep='\t')
# more cleaning
# stop words

stop_words = stopwords.words('english')
Example #13
diff_len = len_1 - len_2
len_char_q1 = len(''.join(set(str(question1).replace(' ', ''))))
len_char_q2 = len(''.join(set(str(question2).replace(' ', ''))))
len_word_q1 = len(str(question1).split())
len_word_q2 = len(str(question2).split())
common_words = len(
    set(str(question1).lower().split()).intersection(
        set(str(question2).lower().split())))

#fuzzy
from fuzzywuzzy import fuzz

fuzz_qratio = fuzz.QRatio(str(question1), str(question2))
fuzz_WRatio = fuzz.WRatio(str(question1), str(question2))
fuzz_partial_ratio = fuzz.partial_ratio(str(question1), str(question2))
fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(
    str(question1), str(question2))
fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(
    str(question1), str(question2))
fuzz_token_set_ratio = fuzz.token_set_ratio(str(question1), str(question2))
fuzz_token_sort_ratio = fuzz.token_sort_ratio(str(question1), str(question2))

#wmd
import gensim

model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)
#sen2vec
import scipy

question1_vectors = scipy.sparse.lil_matrix((dataset.shape[0], 300))
question2_vectors = scipy.sparse.lil_matrix((dataset.shape[0], 300))
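The question-vector matrices above are typically filled by a sentence-embedding helper this snippet elides; a common sketch (assuming numpy as np and the word2vec model loaded above): sum the word vectors and L2-normalize.

import numpy as np

def sent2vec(s):
    words = [w for w in str(s).lower().split() if w.isalpha()]
    vecs = [model[w] for w in words if w in model]
    if not vecs:
        return np.zeros(300)
    v = np.array(vecs).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())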
Example #14
    def parse_authors(self, register):
        """
        Transforms the raw register author information from web of science in the CoLav standard.

        Parameters
        ----------
        register : dict
           Register in web of science format
        
        Returns
        -------
        authors : list
            Information of the authors in the CoLav standard format
        """
        authors = []
        if "PT" in register.keys():
            #if register["PT"].rstrip()=="J":
            corresponding_last_name = ""
            orcid_list = []
            researchid_list = []
            if "RI" in register.keys():
                if register["RI"] and register["RI"] == register["RI"]:
                    ri = register["RI"]
                    if ri[-1] == "\n":
                        ri = ri[:-1]
                    researchid_list = ri.rstrip().replace("; ", ";").split(";")
            if "OI" in register.keys():
                if register["OI"] and register["OI"] == register["OI"]:
                    oi = register["OI"]
                    if oi[-1] == "\n":
                        oi = oi[:-1]
                    orcid_list = oi.rstrip().replace("; ", ";").split(";")
            if "AF" in register.keys():
                author_list = register["AF"].rstrip().split("\n")
                if "RP" in register.keys():
                    if register["RP"]:
                        corresponding_last_name = register["RP"].split(",")[0]
                for au in author_list:
                    entry = {}
                    entry["first_names"] = ""
                    entry["national_id"] = ""
                    entry["last_names"] = ""
                    entry["initials"] = ""
                    entry["full_name"] = ""
                    entry["aliases"] = []
                    entry["affiliations"] = []
                    entry["keywords"] = []
                    entry["external_ids"] = []
                    entry["corresponding"] = False
                    entry["corresponding_address"] = ""
                    entry["corresponding_email"] = ""
                    raw_name = au.split(", ")
                    if len(raw_name) == 1:
                        names = raw_name[0].capitalize()
                        last_names = ""
                    elif len(raw_name) > 2:
                        names = " ".join(raw_name[:-1]).rstrip().capitalize()
                        last_names = raw_name[-1].capitalize()
                    else:
                        names = raw_name[1].capitalize()
                        last_names = raw_name[0].capitalize()

                    entry["full_name"] = names + " " + last_names
                    entry["first_names"] = names
                    entry["last_names"] = last_names
                    entry["initials"] = "".join(
                        [i[0].upper() for i in names.split(" ")])
                    #Checking if there is an external id
                    entry_ext = []
                    for res in researchid_list:
                        if not res:
                            continue
                        try:
                            name, rid = res.split("/")[-2:]
                        except Exception as e:
                            print(
                                "Could not split name and id in researchid field on ",
                                register["doi_idx"])
                            print(e)
                            continue  # name/rid would be undefined below
                        ratio = fuzz.partial_ratio(name,
                                                   last_names + ", " + names)
                        if ratio > 90:
                            entry_ext.append({
                                "source": "researchid",
                                "value": rid
                            })
                            break
                        elif ratio > 50:
                            ratio = fuzz.token_set_ratio(
                                name, last_names + ", " + names)
                            if ratio > 90:
                                entry_ext.append({
                                    "source": "researchid",
                                    "value": rid
                                })
                                break
                            elif ratio > 50:
                                ratio = fuzz.partial_token_set_ratio(
                                    name, last_names + ", " + names)
                                if ratio > 95:
                                    entry_ext.append({
                                        "source": "researchid",
                                        "value": rid
                                    })
                                    break
                    for res in orcid_list:
                        if not res:
                            continue
                        try:
                            name, oid = res.split("/")[-2:]
                        except Exception as e:
                            print(
                                "Could not split name and id in orcid field on ",
                                register["doi_idx"])
                            print(e)
                            continue  # name/oid would be undefined below
                        ratio = fuzz.partial_ratio(name,
                                                   last_names + ", " + names)
                        if ratio > 90:
                            entry_ext.append({"source": "orcid", "value": oid})
                            break
                        elif ratio > 50:
                            ratio = fuzz.token_set_ratio(
                                name, last_names + ", " + names)
                            if ratio > 90:
                                entry_ext.append({
                                    "source": "orcid",
                                    "value": oid
                                })
                                break
                            elif ratio > 50:
                                ratio = fuzz.partial_token_set_ratio(
                                    name, last_names + ", " + names)
                                if ratio > 95:
                                    entry_ext.append({
                                        "source": "orcid",
                                        "value": oid
                                    })
                                    break
                    entry["external_ids"] = entry_ext
                    #Checking if is corresponding author
                    if corresponding_last_name:
                        if corresponding_last_name in last_names:
                            entry["corresponding"] = True
                            if "EM" in register.keys():
                                if register["EM"] and register[
                                        "EM"] == register["EM"]:
                                    entry["corresponding_email"] = register[
                                        "EM"].rstrip()
                    authors.append(entry)
                if len(authors) == 1:
                    authors[0]["corresponding"] = True
        return authors
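The researchid and orcid loops above repeat the same three-stage cascade; factored into a helper it reads as below (name_matches is a hypothetical name, not part of the original class): try strict partial_ratio first, then fall through to progressively looser token scorers, each gated by the previous score.

def name_matches(name, full_name):
    ratio = fuzz.partial_ratio(name, full_name)
    if ratio > 90:
        return True
    if ratio > 50:
        ratio = fuzz.token_set_ratio(name, full_name)
        if ratio > 90:
            return True
        if ratio > 50:
            return fuzz.partial_token_set_ratio(name, full_name) > 95
    return False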
Example #15
    def faxFromList(self, args, isPM):
        '''Look up the monster code in the list using fuzzy matching, 
        then fax it. (Or, if in quiet mode, display its code)
        '''
        splitArgs = args.split()
        if any(s for s in splitArgs if s.strip().lower() == "force"):
            return self.fax(splitArgs[0], splitArgs[0], "(forcing) ", 
                            isPM, force=True)
        
        # first, check for exact code/name/alias matches
        matches = [entry.code for entry in self._faxList.values() 
                   if entry.contains(args)]
        if len(matches) == 1:
            return self.fax(matches[0], self._faxList[matches[0]].name, "", 
                            isPM)
        
        # next, check for "close" matches
        simplify = (lambda x: x.replace("'", "")
                              .replace("_", " ")
                              .replace("-", " ").lower())
        sArgs = simplify(args)
        scoreDiff = 15
        scores = defaultdict(list)
        
        # make list of all possible names/codes/aliases
        allNames = [name for entry in self._faxList.values() 
                    for name in entry.nameList] 
        for s in allNames:
            score1 = fuzz.partial_token_set_ratio(simplify(s), sArgs)
            scores[score1].append(s)
        allScores = list(scores.keys())  # materialize: keys are deleted below
        maxScore = max(allScores)
        for score in allScores:
            if score < maxScore - scoreDiff:
                del scores[score]
        matches = []
        for match in scores.values():
            matches.extend(match)
        fuzzyMatchKeys = set(entry.code for entry in self._faxList.values() 
                             for match in matches if entry.contains(match))
        

        # also check for args as a subset of string or code
        detokenize = lambda x: ''.join(re.split(r"'|_|-| ", x)).lower()
        dArgs = detokenize(args)
        matches = [name for name in allNames if dArgs in detokenize(name)]
        subsetMatchKeys = set(entry.code for entry in self._faxList.values() 
                              for match in matches if entry.contains(match))
        
        ls = len(subsetMatchKeys)
        lf = len(fuzzyMatchKeys)
        matchKeys = subsetMatchKeys | fuzzyMatchKeys
        lm = len(matchKeys)
        
        if ls == 0 and lf == 1:
            m = matchKeys.pop()
            return self.fax(m, self._faxList[m].name, "(fuzzy match) ", isPM)
        elif lm == 1:
            m = matchKeys.pop()
            return self.fax(m, self._faxList[m].name, "(subset match) ", isPM)
        elif lm > 1 and lm < 6:
            possibleMatchStr = ", ".join(
                    ("{} ({})".format(self._faxList[k].name,k))
                     for k in matchKeys)
            return "Did you mean one of: {}?".format(possibleMatchStr)
                    
        elif lm > 1:
            return ("Matched {} monster names/codes; please be more specific."
                    .format(ls + lf))

        return ("No known monster with name/code matching '{0}'. "
                "Use '!fax {0} force' to force, or check the monster list "
                "at {1} .".format(args, self.fax_list_url))
Example #16
 def testPartialTokenSetRatio(self):
     self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)
Example #17
def fuzz_partial_token_setratio(q1, q2):
    return fuzz.partial_token_set_ratio(q1, q2)
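A one-line usage sketch (inputs illustrative): token-set scorers ignore word order, so reordered questions still score 100.

print(fuzz_partial_token_setratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))  # 100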