コード例 #1
0
    def deserialize(self, type, name, language='en'):
        """Load a serialized object from disk and rebuild it.

        The raw payload is read from ``name`` (pickle for ``"lda_model"``,
        UTF-8 JSON for every other type), passed to the serializer registered
        under ``type`` in ``self.serializers``, and that serializer's
        ``deserialize()`` result is returned. For ``"phrases"`` the result is
        installed as the ``phrasegrams`` of a fresh ``Phrases`` object and
        that object is returned instead.

        Args:
            type: Key into ``self.serializers``; also selects the file format.
                (The name shadows the builtin but is part of the public
                signature, so it is kept.)
            name: Path of the file to read.
            language: Only consulted for ``"phrases"``. ``'en'`` uses
                ``self.function_words_single`` as connector words; any other
                value uses ``safe_get_stop_words(language)``.

        Returns:
            The serializer's deserialized value, or a ``Phrases`` instance
            wrapping it when ``type == "phrases"``.

        Raises:
            Whatever ``self.serializers[type]`` raises for an unregistered
            type, plus any I/O or decoding error from reading ``name``.
        """
        # Resolve the serializer first so an unregistered type fails before
        # any file is opened.
        serializer = self.serializers[type]

        if type == "lda_model":
            # NOTE(security): pickle.load can execute arbitrary code embedded
            # in the file. Only call this with files from a trusted source.
            with open(name, "rb") as f:
                data = pickle.load(f)
        else:
            with open(name, "r", encoding="utf-8") as f:
                data = json.load(f)

        deserialized = serializer(data).deserialize()

        if type != "phrases":
            return deserialized

        # Phrase tables are stored as a bare mapping; rebuild the Phrases
        # wrapper around it with the language's connector words.
        if language == 'en':
            common_terms = self.function_words_single
        else:
            common_terms = safe_get_stop_words(language)

        phrases = Phrases(delimiter="_", connector_words=common_terms)
        phrases.phrasegrams = deserialized
        return phrases
コード例 #2
0
    def get_association(self, df, min_count=1, threshold=0.70, save_phraser=False, language='en'):
        """Compute the association table for ``df`` and optionally build a phraser.

        A ``C2xG`` instance is created for ``self.settings.MAP_THREE[language]``
        and its ``get_association`` is run over ``self.read(df)``. When
        ``save_phraser`` is truthy, a ``Phrases`` object is also built from the
        rows whose score exceeds ``threshold`` and stored on ``self.phrases``.

        Args:
            df: Input handed to ``self.read`` before association scoring.
            min_count: Passed to ``C2xG.get_association`` as ``freq_threshold``
                and to ``Phrases`` as ``min_count``.
            threshold: Rows are kept for the phraser only when their score is
                strictly greater than this value; also passed to ``Phrases``.
            save_phraser: When truthy, build the phraser and assign it to
                ``self.phrases``.
            language: Key into ``self.settings.MAP_THREE``. ``'en'`` uses
                ``self.function_words_single`` as connector words; any other
                value uses ``safe_get_stop_words(language)``.

        Returns:
            The association table returned by ``C2xG.get_association``.
        """
        cxg = C2xG(language=self.settings.MAP_THREE[language])
        association_df = cxg.get_association(
            self.read(df),
            freq_threshold=min_count,
            smoothing=False,
            lex_only=True,
        )

        if not save_phraser:
            return association_df

        if language == 'en':
            common_terms = self.function_words_single
        else:
            common_terms = safe_get_stop_words(language)

        # itertuples() yields the index at position 0, so positions 1-3 are
        # the frame's first three columns -- presumably (word, word, score);
        # TODO confirm against C2xG's output schema.
        # The joined key is only built for rows that pass the threshold.
        phrasegrams = {
            row[1] + "_" + row[2]: row[3]
            for row in association_df.itertuples()
            if row[3] > threshold
        }

        phrases = Phrases(delimiter="_", connector_words=common_terms, min_count=min_count, threshold=threshold)
        phrases.phrasegrams = phrasegrams
        self.phrases = phrases

        return association_df