def deserialize(self, type, name, language='en'):
    """Read a stored object from disk and rebuild it with its serializer.

    Args:
        type: Key into ``self.serializers`` selecting the serializer class.
            ``"lda_model"`` files are read with ``pickle``; every other
            type is read as UTF-8 JSON. ``"phrases"`` results are wrapped
            in a ``Phrases`` object before being returned. (The parameter
            name shadows the builtin; it is kept for caller compatibility.)
        name: Path of the file to read.
        language: Only consulted for ``"phrases"``: ``'en'`` uses
            ``self.function_words_single`` as connector words, any other
            value uses ``safe_get_stop_words(language)``.

    Returns:
        The value produced by ``serializer(data).deserialize()``, or, for
        ``"phrases"``, a ``Phrases`` object whose ``phrasegrams`` attribute
        is set to that value.
    """
    # The serializer lookup happens before any file is opened.
    loader_cls = self.serializers[type]

    if type == "lda_model":
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at files from a trusted source.
        with open(name, "rb") as handle:
            payload = pickle.load(handle)
    else:
        with codecs.open(name, "r", encoding="utf-8") as handle:
            payload = json.load(handle)

    restored = loader_cls(payload).deserialize()
    if type != "phrases":
        return restored

    connectors = (
        self.function_words_single
        if language == 'en'
        else safe_get_stop_words(language)
    )
    # NOTE(review): Phrases is built with its default min_count/threshold
    # here, whereas get_association passes them explicitly - confirm that
    # the difference is intended.
    model = Phrases(delimiter="_", connector_words=connectors)
    model.phrasegrams = restored
    return model
def get_association(self, df, min_count=1, threshold=0.70, save_phraser=False, language='en'):
    """Compute word-pair association scores and optionally keep a phraser.

    Args:
        df: Input handed to ``self.read`` before being passed to
            ``C2xG.get_association``.
        min_count: Forwarded as ``freq_threshold`` to the association
            computation and as ``min_count`` to ``Phrases``.
        threshold: Rows whose score (fourth element of each ``itertuples``
            row, i.e. the third column) is strictly greater than this
            value are kept as phrasegrams; also forwarded to ``Phrases``.
        save_phraser: When truthy, build a ``Phrases`` object from the
            filtered pairs and store it on ``self.phrases``.
        language: Key into ``self.settings.MAP_THREE`` used to configure
            ``C2xG``; ``'en'`` uses ``self.function_words_single`` as
            connector words, any other value uses
            ``safe_get_stop_words(language)``.

    Returns:
        The association table returned by ``C2xG.get_association``.
    """
    cxg = C2xG(language=self.settings.MAP_THREE[language])
    association_df = cxg.get_association(
        self.read(df),
        freq_threshold=min_count,
        smoothing=False,
        lex_only=True,
    )

    # Nothing else to do unless the caller asked for a phraser.
    if not save_phraser:
        return association_df

    if language == 'en':
        common_terms = self.function_words_single
    else:
        common_terms = safe_get_stop_words(language)

    # row[0] is the DataFrame index; row[1] and row[2] are the two words and
    # row[3] is their score. A later duplicate pair overwrites an earlier one.
    phrasegrams = {
        row[1] + "_" + row[2]: row[3]
        for row in association_df.itertuples()
        if row[3] > threshold
    }

    phrases = Phrases(
        delimiter="_",
        connector_words=common_terms,
        min_count=min_count,
        threshold=threshold,
    )
    phrases.phrasegrams = phrasegrams
    self.phrases = phrases

    return association_df