class DataHandler:
    TOP_TERMS_PER_CLUSTER = config.get_env("DEFAULT_TOP_TERMS_PER_CLUSTER")
    HAS_MULTIPLE_DATA_SOURCES = False
    DATA_SOURCE = None
    SHUFFLE_DATA = True
    PRE_LOAD_UUID = None

    def __init__(self, name):
        self.name = name
        log.info(f'{name} Data Loaded')
        self.df = None
        self.saved_item_to_cluster = None

    def display_labels(self):
        pass

    def meta_info(self):
        return [{"content": ''}] * self.df.shape[0]

    def item_to_cluster(self):
        return self.saved_item_to_cluster

    def clean_up_df_text(self, col, language="english", clean_up_method="nltk"):
        return clean_up_text(self.df, col, language, clean_up_method)

    def calculate_n_clusters(self):
        return calculate_n_clusters_by_category(self.df.shape[0])['medium'][1]
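
# --- Hypothetical usage sketch (not part of the original source): a concrete
# handler would subclass DataHandler, override the class attributes for its
# data source, and populate self.df. The CSV handler below is an assumption
# for illustration only; the base class expects self.df to be a pandas
# DataFrame (meta_info and calculate_n_clusters rely on self.df.shape).
import pandas as pd


class CsvDataHandler(DataHandler):
    DATA_SOURCE = "csv"

    def __init__(self, name, path):
        super().__init__(name)
        self.df = pd.read_csv(path)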
def clean_up_text(df, column_name, language, clean_up_method):
    log.info('Starting Text Cleanup')
    # Fall back to single-process tokenization when fewer than two workers
    # are configured.
    should_use_single_processing = config.get_env("PROCESSES_NUMBER") < 2
    log.info(f'Using {clean_up_method}. Language={language}')

    if clean_up_method == "nltk":
        class_to_use = NltkTextCleaner(language)
    elif clean_up_method == "spacy":
        if language not in ['english', 'german']:
            log.warn(f'SpaCy does not support {language}')
            return
        class_to_use = SpacyTextCleaner(language)
    else:
        log.warn(f'{clean_up_method} not found')
        return

    if should_use_single_processing:
        return df[column_name].apply(
            lambda x: class_to_use.tokenizer(x)).tolist()

    start_time = time.time()
    with mp.Pool() as pool:
        result = pool.map(class_to_use.tokenizer, df[column_name])
    log.info("Finished Text Clean up after %s seconds"
             % (time.time() - start_time))
    return result
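
# --- Hypothetical usage sketch (assumes a pandas DataFrame with a text
# column): clean_up_text returns one token list per row, or None when the
# cleaner/language combination is unsupported.
import pandas as pd

sample_df = pd.DataFrame({"description": ["Cats chase mice.", "Dogs chase cats."]})
tokens = clean_up_text(sample_df, "description",
                       language="english", clean_up_method="nltk")
# tokens -> e.g. [['cat', 'chase', 'mouse'], ['dog', 'chase', 'cat']],
# depending on what NltkTextCleaner.tokenizer actually does.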
    def calculate(self, stopwords=None):
        self.vectorizer = TfidfVectorizer(stop_words=stopwords,
                                          max_df=0.8,
                                          tokenizer=identity_func(stopwords),
                                          lowercase=False)
        self.tfidf_matrix = self.vectorizer.fit_transform(self.documents)
        self.model = KMeans(n_clusters=self.n_clusters,
                            init='k-means++',
                            max_iter=self.max_iteration,
                            n_init=1,
                            n_jobs=config.get_env("PROCESSES_NUMBER"))
        self.model.fit(self.tfidf_matrix)
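
    # --- Hypothetical sketch (not in the original source): the standard
    # scikit-learn pattern for reading the top terms per cluster from the
    # fitted k-means model; top_terms_per_cluster is an assumed attribute
    # mirroring TOP_TERMS_PER_CLUSTER above.
    def top_terms(self):
        # Sort each centroid's feature weights in descending order.
        order_centroids = self.model.cluster_centers_.argsort()[:, ::-1]
        terms = self.vectorizer.get_feature_names()
        return [[terms[i] for i in order_centroids[c, :self.top_terms_per_cluster]]
                for c in range(self.n_clusters)]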
    def __init__(self, documents, n_clusters, top_terms_per_cluster):
        self.documents = documents
        self.top_terms_per_cluster = top_terms_per_cluster
        self.vectorizer = CountVectorizer(max_df=0.8,
                                          min_df=2,
                                          stop_words=None,
                                          tokenizer=identity_func(None),
                                          lowercase=False)
        self.tfidf = self.vectorizer.fit_transform(documents)
        self.features = self.vectorizer.get_feature_names()
        self.clf = LatentDirichletAllocation(
            n_components=n_clusters,
            n_jobs=config.get_env("PROCESSES_NUMBER"),
            random_state=0).fit(self.tfidf)
        self.topics()
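
    # --- Hypothetical sketch of the topics() step (its real implementation is
    # not shown in this excerpt): the canonical LDA pattern of taking the
    # highest-weighted features from each fitted component.
    def topics(self):
        self.topic_terms = [
            [self.features[i]
             for i in component.argsort()[:-self.top_terms_per_cluster - 1:-1]]
            for component in self.clf.components_
        ]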
def configure_app(app):
    app.config.from_object(get_config())
    app.config.from_pyfile(get_env())
    return app
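
# --- Hypothetical usage sketch: configure_app layers the config object and
# the environment-specific pyfile onto a fresh Flask app.
from flask import Flask

app = configure_app(Flask(__name__))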
"origins": [ "http://localhost:8080", "https://pandermatt.ch", "https://kenspace.ch" ] }, }) authorizations = { 'Bearer Auth': { 'type': 'apiKey', 'in': 'header', 'name': 'Authorization' }, } swagger_ui_enabled = '/' if config.get_env('PRODUCTION') == 'Y': swagger_ui_enabled = False api = Api(app, version='0.1.0', title='KenSpace API', description='API for KenSpace', security='Bearer Auth', authorizations=authorizations, doc=swagger_ui_enabled) queries = api.namespace('queries', description='Query operations') auth = api.namespace('auth', description='Authentication') feedback = api.namespace('feedback', description='Submit Feedback') upload = api.namespace('upload', description='Upload Data')
def verify_token(token):
    try:
        return token in config.get_env('AUTH_KEY')
    except RuntimeError:
        token_auth_error()
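
# --- Hypothetical usage sketch: gate an incoming request on the bearer token
# from the Authorization header. The request handling around verify_token is
# an assumption; only verify_token and token_auth_error come from the source.
from flask import request


def require_token():
    token = request.headers.get('Authorization', '')
    if not verify_token(token):
        token_auth_error()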