import base64
from math import log

import pandas as pd

# init_db, CorrectedMessage and tf_vector come from the surrounding project.


def generate_TFIDF_table():
    """
    Generates the TF-IDF (Term Frequency - Inverse Document Frequency)
    table of the messages.

    Returns
    -------
    tfidf : pd.DataFrame
        TF-IDF table.

    """
    tfidf = pd.DataFrame()
    tfidf['_id'] = []
    init_db()
    for msg in CorrectedMessage.objects():
        text = base64.urlsafe_b64decode(
            msg.bodyBase64Plain.encode()).decode()
        # Note: DataFrame.append was removed in pandas 2.0; this code
        # requires pandas < 2.0 (or a port to pd.concat).
        tfidf = tfidf.append(
            tf_vector(text, tfidf.columns, tfidf, msg.msg_id,
                      msg.corrections),
            ignore_index=True)
    num_msg = len(tfidf)
    for word in tfidf.drop(columns='_id').columns:
        # Smoothed IDF: the +1 in the denominator avoids division by zero.
        idf = log(num_msg / (len(tfidf[tfidf[word] > 0]) + 1))
        tfidf[word] = tfidf[word] * idf
    return tfidf
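# A minimal, self-contained sketch (toy data, not from the project) of the
# smoothed-IDF weighting applied above: idf = log(N / (doc_freq + 1)). The
# "+ 1" in the denominator avoids division by zero, but it also means a term
# that occurs in every message receives a slightly negative weight.
import pandas as pd
from math import log

toy = pd.DataFrame({'_id': ['m1', 'm2', 'm3'],
                    'rare': [2, 0, 0],      # appears in 1 of 3 messages
                    'common': [1, 1, 1]})   # appears in all 3 messages
num_msg = len(toy)
for word in ('rare', 'common'):
    idf = log(num_msg / (len(toy[toy[word] > 0]) + 1))
    toy[word] = toy[word] * idf
# 'rare'   -> tf * log(3/2) ~  0.405 * tf
# 'common' -> tf * log(3/4) ~ -0.288 * tf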
def init_db_and_get_data(self):
    init_db()
    account = Account.objects.get(account_name='chris_whu')
    user = User.objects.get(account=account, user_name='xrb')
    device = Device.objects.filter(user=user)[0]
    return account, user, device
def connect_db():
    """Connects to the configured database, initialising it on first use."""
    if not os.path.exists(app.config['DATABASE']):
        initdb.init_db(app.config['DATABASE'], 'schema.sql')
    rv = sqlite3.connect(app.config['DATABASE'])
    # Return rows that support access by column name as well as by index.
    rv.row_factory = sqlite3.Row
    return rv
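# Usage sketch for connect_db() above: because row_factory is set to
# sqlite3.Row, result rows can be read by column name as well as by index.
# The query below is schema-independent (sqlite_master is a SQLite built-in).
db = connect_db()
for row in db.execute("SELECT name FROM sqlite_master WHERE type = 'table'"):
    print(row['name'])   # equivalent to row[0]
db.close()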
def setUpClass(cls):
    config = init_db()
    global client_app
    wrapper = FlaskAppWrapper(providers=[AIDAIndex()],
                              external_config=config)
    wrapper.create_app()
    client_app = wrapper.test_app()
def main():
    init_db()
    df = pd.DataFrame([
        without_keys(m, cf.UNUSED_FIELDS)
        for m in json.loads(Metrics.objects().to_json())
    ])
    # Note: set_index returns a new frame and the result is discarded here,
    # so '_id' remains an ordinary column -- the drops below depend on that.
    df.set_index('_id')
    transform_recipients_columns(df)
    # df.to_csv('dataframe.csv', index=False)
    # df = pd.read_csv('dataframe.csv')

    describe_dataframe(df, 'general_description')
    for t in RelationshipType:
        describe_dataframe(df[df.relationship.eq(t.name)],
                           t.name + '_description')

    features = df.drop(columns=['_id', 'relationship'])
    # Min-max scale every feature column to [0, 1] (see the vectorized
    # equivalent sketched after this function).
    normalized = features.copy()
    for c in normalized.columns:
        normalized[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min())
    # normalized.to_csv('normalized.csv', index=False)
    # normalized = pd.read_csv('normalized.csv')

    study_kmeans_silhouette_score('norm_', normalized)
    study_dbscan_silhouette_score('norm_', normalized, df['relationship'])
    get_scatter_matrix('norm_', normalized, df['relationship'])

    study_kmeans_silhouette_score('data_', features)
    study_dbscan_silhouette_score('data_', features, df['relationship'])
    get_scatter_matrix('data_', features, df['relationship'])

    pca_analysis(normalized, df['relationship'], 'norm_')
    pca_analysis(features, df['relationship'], 'data_')

    for c in ['gini', 'entropy']:
        classify_with_decission_tree(features, df['relationship'], c, 'data_')
        classify_with_decission_tree(normalized, df['relationship'], c,
                                     'norm_')
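# Side note (a sketch, equivalent in behavior for numeric columns): the
# per-column min-max loop above can be written as one vectorized pandas
# expression, since DataFrame.min()/max() aggregate column-wise:
#
#   features = df.drop(columns=['_id', 'relationship'])
#   normalized = (features - features.min()) / (features.max() - features.min())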
def analyse(self, nextPageToken=None, sign=None):
    """
    Analyses all the messages of the given user.

    Parameters
    ----------
    nextPageToken : str, optional
        Token of the next page for extracting messages. The default is None.
    sign : str, optional
        Signature of the user in his emails. The default is None.

    Returns
    -------
    None.

    """
    init_db()
    SessionTypoError.drop_collection()
    # PreprocessedMessage.drop_collection()
    # CorrectedMessage.drop_collection()
    # Metrics.drop_collection()
    self.__quota = self.__extractor.extract_sent_msg(
        self.__nres, nextPageToken)
    for ext in ExtractedMessage.objects():
        self.__preprocess_message(ext.to_json(), sign)
    for prep in PreprocessedMessage.objects():
        self.__correct_message(prep.to_json())
    for cor in CorrectedMessage.objects():
        self.__measure_style(cor.to_json())
    with open('log.txt', 'a') as f:
        f.write('\nANALYSIS FINISHED:\n')
        f.write(
            f'{ExtractedMessage.objects().count()} preprocessed messages.\n')
        f.write(
            f'{PreprocessedMessage.objects().count()} typo-corrected '
            'messages.\n')
        f.write(f'{CorrectedMessage.objects().count()} measured messages.\n')
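# Hypothetical usage sketch (the analyser's construction is not shown in this
# excerpt, so the object below is assumed): run a full extract -> preprocess
# -> typo-correct -> measure pass, passing the user's email signature so the
# preprocessing step can presumably strip it before analysis.
#
#   analyser.analyse(sign='Best regards,\nJane Doe')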
import argparse
from typing import List, Union

from apscheduler.schedulers.blocking import BlockingScheduler

# DatabaseUtils, GetStockData and init_db come from the surrounding project.


def update_realtime(symbols: Union[str, List[str]]):
    DatabaseUtils.save_realtime(GetStockData.get_real_time_price(symbols))


def update_daily(symbols: Union[str, List[str]]):
    DatabaseUtils.save_daily_history(GetStockData.get_daily_data(symbols))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('symbols', nargs='+')
    args = parser.parse_args()
    print(args.symbols)
    init_db(args.symbols)

    scheduler = BlockingScheduler()
    scheduler.add_job(update_realtime, 'interval', seconds=3,
                      args=[args.symbols])
    scheduler.add_job(update_daily, 'interval', minutes=1,
                      args=[args.symbols])
    try:
        # The excerpt is truncated at the try block; the usual APScheduler
        # pattern is to start the blocking scheduler and swallow the
        # shutdown interrupt, as sketched here.
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        pass
import os

from flask import Flask, request, jsonify

from stylemeasuring.stylemeter import StyleMeter
from initdb import init_db
from confanalyser import NLP

os.chdir(initial_dir)  # initial_dir is defined earlier in the original module

app = Flask(__name__)
stylemeter = StyleMeter(NLP)


@app.route('/stylemeter', methods=['POST'])
def measure_style():
    """
    Measures the writing style of the given message.

    Returns
    -------
    flask.Response
        JSON response with the id of the measured message.

    """
    msg = request.json['message']
    msg['id'] = msg['_id']
    del msg['_id']
    return jsonify({'id': stylemeter.measure_style(msg)})


if __name__ == '__main__':
    init_db()
    app.run(debug=True, port=6000)
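# Hedged client sketch for the /stylemeter endpoint above. The handler reads
# request.json['message'] and renames its '_id' key to 'id'; any further
# message fields StyleMeter needs are not visible in this excerpt, so the
# payload below is deliberately minimal and illustrative.
import requests

resp = requests.post('http://localhost:6000/stylemeter',
                     json={'message': {'_id': 'example-message-id'}})
print(resp.json())   # e.g. {'id': ...}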
def setUpClass(cls):
    cls.config = init_db()
def main():
    init_db()
    for m in Metrics.objects():
        # Classify every contact found in the To, Cc and Bcc recipient lists.
        for recipients in [m.to, m.cc, m.bcc]:
            for contact in recipients:
                classify_contact(contact)