# Assumed imports for this snippet; `traindata` and `utils` are
# project-local helpers used below.
import sys
import traceback
import numpy as np
import pandas as pd
from polyfuzz import PolyFuzz

def similaritypolymain(pTrainData, pTestData, pLevel1, pLevel2, pDesc, pFromDir, pToDir, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()].reset_index(drop=True)
        pTestData['Intent'], pTestData['Confidence_Level'] = 'NaN', 'NaN'
        pTrainData, __ = traindata(pTrainData, pDesc, pLevel1, pLevel2, pFromDir, pToDir)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pTrainDataDescUnq = pTrainDataDesc[pDesc].unique().tolist()
        pTestDataDescList = list(pTestData[pDesc].values) #need to convert back to a list
        model = PolyFuzz("TF-IDF")
        model.match(pTestDataDescList, pTrainDataDescUnq, nbest = int(Nbest))
        pMatchesDf = model.get_matches()

        IntCol = ["To"]
        for i in range(1, int(Nbest)-1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestData['Intent' + '__' + str(i)] = 'NaN'

        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestData['Confidence_Level'+ '__' + str(k)] = 'NaN'
            
        # Map the best match and each n-best alternative back to a training Intent.
        for i in range(len(IntCol)):
            col = IntCol[i]
            target = 'Intent' if col == "To" else 'Intent' + '__' + str(i)
            for j in range(len(pTestData)):
                if pMatchesDf[col][j] is not None:
                    matched = pTrainData.loc[pTrainData[pDesc] == pMatchesDf[col][j], 'Intent']
                    if len(matched):
                        pTestData.loc[j, target] = matched.values[0]
                    
            
        # Copy the similarity scores into the matching confidence columns.
        for l in range(len(SimCol)):
            col = SimCol[l]
            target = 'Confidence_Level' if col == "Similarity" else 'Confidence_Level' + '__' + str(l)
            for m in range(len(pTestData)):
                if pMatchesDf[col][m] is not None:
                    pTestData.loc[m, target] = pMatchesDf[col][m]
            
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        utils.movefile(pFromDir, pToDir)
        return -1, None
    return 0, pTestData
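
A minimal call sketch (file names, directories, and column labels here are hypothetical; the project-local `traindata` and `utils` modules must be importable):

train = pd.read_csv("train.csv")   # hypothetical file with Description/Level1/Level2 columns
test = pd.read_csv("test.csv")
status, scored = similaritypolymain(train, test, 'Level1', 'Level2',
                                    'Description', 'from_dir', 'to_dir', Nbest=3)
if status == 0:
    print(scored[['Intent', 'Confidence_Level']].head())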
Example 2
def similaritypolymain(pTrainData, pTestData, pAsg, pDesc, pDate, Nbest):
    try:
        pTrainData = pTrainData[pTrainData[pDesc].notna()]
        pTestData = pTestData[pTestData[pDesc].notna()].reset_index(drop=True)
        pTestData['Assignee_Group_Pred'], pTestData['Confidence_Level'] = 'NaN', float(0.0)
        pTrainDataDesc = pd.DataFrame(pTrainData[pDesc])
        pFeaList = []
        pFeaList = pTrainData['Features'].tolist() + pTestData['Features'].tolist()
        pFeaUnqList = list(set(pFeaList))   
        pMatchData, pData, pTestAppendDf = [], [], []
        pMatchesDf, pTestMatchData, pTestDf = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
        for i in range(len(pFeaUnqList)):
            ToData, FromData = pd.DataFrame(), pd.DataFrame()
            FromData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[i]]
            ToData = pTestData.loc[pTestData['Features'] == pFeaUnqList[i]]
            model = PolyFuzz("TF-IDF")
            pTestAppendDf.append(ToData)
            if len(ToData[pDesc]) >= 1 and len(FromData[pDesc]) >= 1:
                model.match(list(ToData[pDesc].values), FromData[pDesc].unique().tolist(), nbest = int(Nbest))
                Matches = model.get_matches()
                pMatchData.append(Matches)
                pData.append(ToData)              
            
        pMatchesDf = pd.concat(pMatchData)
        pTestMatchData = pd.concat(pData) 
        pTestDf = pd.concat(pTestAppendDf)
        pMatchesDf.reset_index(drop=True, inplace=True)
        pTestMatchData.reset_index(drop=True, inplace=True)
        pTestDf.reset_index(drop=True, inplace=True)
        
        pTestConcatData = pd.concat([pTestMatchData,pMatchesDf], axis = 1)
        
        IntCol = ["To"]
        for i in range(1, int(Nbest)-1):
            IntCol.append("BestMatch" + "__" + str(i))
            pTestMatchData['Assignee_Group_Pred' + '__' + str(i)] = 'NaN'

        SimCol = ['Similarity']
        for k in range(1, int(Nbest) - 1):
            SimCol.append("Similarity" + "__" + str(k))
            pTestMatchData['Confidence_Level'+ '__' + str(k)] = 'NaN'
            
        # For each feature slice, map every match column back to a training assignee.
        pTestAppendFea = []
        for p in range(len(pFeaUnqList)):
            pTrainFeaData = pTrainData.loc[pTrainData['Features'] == pFeaUnqList[p]]
            pTestFeaData = pTestConcatData.loc[pTestConcatData['Features'] == pFeaUnqList[p]].reset_index(drop=True)
            if len(pTestFeaData) == 0 or len(pTrainFeaData) == 0:
                continue
            for i in range(len(IntCol)):
                col = IntCol[i]
                target = ('Assignee_Group_Pred' if col == "To"
                          else 'Assignee_Group_Pred' + '__' + str(i))
                for j in range(len(pTestFeaData)):
                    if pTestFeaData[col][j] is not None:
                        matched = pTrainFeaData.loc[pTrainFeaData[pDesc] == pTestFeaData[col][j], pAsg]
                        pTestFeaData.loc[j, target] = matched.values[0] if len(matched) else None
            pTestAppendFea.append(pTestFeaData)
        pTestFeaDf = pd.concat(pTestAppendFea)
        pTestFeaDf.reset_index(drop=True, inplace=True)
        
        pTestDf.loc[pTestDf['Number'].isin(pTestFeaDf['Number']), ['Confidence_Level', 'Assignee_Group_Pred']] = pTestFeaDf[['Similarity', 'Assignee_Group_Pred']].values
        
    except Exception as e:
        print('*** ERROR[004]: Error in similarity poly main function: ', sys.exc_info()[0], str(e))
        print(traceback.format_exc())
        return -1, None
    return 0, pTestDf
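
The per-feature loop above is a group-then-match pattern; a compact hedged sketch of the same idea, with illustrative frame and column names:

parts = []
for feature in set(train['Features']) & set(test['Features']):
    sub_from = test.loc[test['Features'] == feature, 'Description'].tolist()
    sub_to = train.loc[train['Features'] == feature, 'Description'].unique().tolist()
    if sub_from and sub_to:
        model = PolyFuzz("TF-IDF")
        model.match(sub_from, sub_to)
        parts.append(model.get_matches())
matches = pd.concat(parts, ignore_index=True)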
Example 3
# Assumed imports for this snippet; `config` and `Filter` are project-local.
import re
from typing import Dict, List

import pandas as pd
import toml
from polyfuzz import PolyFuzz


class Matcher:

    def __init__(self):

        # Load PolyFuzz model for matching. Default: TF-IDF
        self.model = PolyFuzz(config.MODEL_MATCHING)

        # Load the filters
        self.filters: Dict[str, List[Filter]] = self.__load_filters()

    @staticmethod
    def __load_filters() -> dict:
        """
        Load the filters from filters.toml (by default), create Filter
        objects, and return a dictionary of these object classified by
        intent.
        """
        filters = {}

        # Load the raw filter
        toml_file = toml.load(config.FILTERS_TOML, _dict=dict)

        # Loop over each intent
        for intent, raw_filters in toml_file.items():
            filter_list = []

            # Loop over each filter in this intent
            for name, content in raw_filters.items():

                # Create and append a Filter object
                filter_list.append(
                    Filter(
                        name=name,
                        words=content['words'],
                        regex=content['regex'],
                        threshold=content['threshold']
                    )
                )

            # Save the filters to the main dictionary
            filters[intent] = filter_list

        return filters

    def get_keywords(self, text: str, intent: str) -> dict:

        keywords = {}
        if intent in self.filters:

            # Split the text into a list of words
            entries = text.split(" ")

            for filter_ in self.filters[intent]:

                # Match similarities between the filter and the given text
                self.model.match(entries, filter_.words)
                matches: pd.DataFrame = self.model.get_matches()

                try:
                    # Get the word with the maximum similarity
                    thresholds = matches[matches['Similarity'] >= filter_.threshold]
                    keyword = thresholds[thresholds['Similarity'] == thresholds['Similarity'].max()].iloc[0, 0]

                except Exception:
                    # If there's no match, set the filter as None
                    keywords[filter_.name] = None

                else:
                    # Use the keyword to retrieve and save its chained-data
                    if result := re.search(filter_.regex % keyword, text):
                        keywords[filter_.name] = result.group(filter_.name)

                    else:
                        keywords[filter_.name] = None

        return keywords
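
The loader expects each intent to be a TOML table whose sub-tables carry `words`, `regex`, and `threshold`; `get_keywords` then fills the `%s` placeholder with the matched keyword and reads a named group equal to the filter name. An illustrative filters.toml (intent and filter names are made up):

[play_music.song]
words = ["play", "listen"]
regex = '%s (?P<song>.+)'
threshold = 0.75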
Example 4

    c1, c2, c3 = st.beta_columns([5, 5, 5])  # pre-1.0 Streamlit API; st.columns in current releases

    with c2:

        # gif_runner = st.image("mouse.gif")
        # gif_runner = st.image(gif_path)
        gif_runner = st.image("mouse.gif")

        import time

        time.sleep(2)

        model.match(linesList, col_one_list)
        # Auto map by Similarity scores
        Polyfuzz = model.get_matches()
        gif_runner.empty()

    if (RadioMapTo == "To crawled titles") and (RadioMapWhat == "Map Broken URLs"):
Example 5
    print("\nRemove repeated retweets... >80% fast text similarity")
    fasttext_embeddings = WordEmbeddings('en-crawl')
    fasttext = Embeddings(fasttext_embeddings,
                          min_similarity=0,
                          model_id="FastText")
    model = PolyFuzz(fasttext)

    start = time.time()
    indexes_to_remove = []
    for topic in relevant_tweetir["topic"].unique():
        topic_tweets = relevant_tweetir.loc[relevant_tweetir["topic"] == topic,
                                            "tweets.full_text"]
        for index, tweet in topic_tweets.items():
            indexes = topic_tweets.index[topic_tweets.index != index]
            for ind in indexes:
                model.match(tweet.split(), topic_tweets.loc[ind].split())
                mean_sim = round(model.get_matches()["Similarity"].mean(), 2)
                if mean_sim > 0.8:
                    indexes_to_remove.append(ind)
                    break
    relevant_tweetir = relevant_tweetir[~relevant_tweetir.index.
                                        isin(indexes_to_remove)]
    end = time.time()

    print(indexes_to_remove)
    print(f"Computation time - {round(end - start, 2)} seconds")

    relevant_tweetir["tweets.full_text"] = relevant_tweetir[
        "tweets.full_text"].drop_duplicates()
    relevant_tweetir.dropna(subset=["tweets.full_text"], inplace=True)
    relevant_tweetir = remove_irrelevant_topics(relevant_tweetir,
Example 6

if RadioMapAgainst == "all crawled URLs":
    GSCDf = dfIndexable


##########################################################

model = PolyFuzz("EditDistance")

start_execution = c30.button(" Run model! ✨ ")


model.match(linesDeduped2, col_one_list)
# Auto map by Similarity scores
Polyfuzz = model.get_matches()

Polyfuzz  # Streamlit "magic": a bare expression is rendered in the app

st.stop()

# start_execution = c30.button(" 🚀✨Run model! ")
if start_execution:
    # Shade the match table by similarity (assumes: import seaborn as sns)
    cm = sns.light_palette("red", as_cmap=True, reverse=True)
    FuzzStyled = Polyfuzz.style.background_gradient(cmap=cm)
Example 7
from polyfuzz import PolyFuzz

# URL variant (overridden by the word lists below):
# from_list = ["https://www.tatielou.co.uk/apples/sadasda", "https://www.tatielou.co.uk/oranges/sadasda"]
# to_list = ["https://www.tatielou.co.uk/apples/", "https://www.tatielou.co.uk/oranges/", "https://www.tatielou.co.uk/pears/"]
from_list = ["apple", "apples", "appl", "recal", "house", "similarity"]
to_list = ["apple", "apples", "mouse"]
model = PolyFuzz("EditDistance")
model.match(from_list, to_list)
# Auto map by Similarity scores
model.get_matches()
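
get_matches() returns a plain pandas DataFrame with From, To, and Similarity columns, so the results can be filtered like any other frame; a small follow-up sketch (threshold illustrative):

matches = model.get_matches()
confident = matches[matches["Similarity"] >= 0.8]  # keep only close pairs
print(confident.sort_values("Similarity", ascending=False))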