import json
import os
import uuid

# ApiConnector and MongoConnector are project-local helpers; their imports
# are not shown in this excerpt.


class ConnectionEstablisher(object):
    # Environment variable names needed for the API.AI and MongoDB clients
    MONGODB_ACCESS = "DB_ACCESS"
    APICLIENT_ACCESS_TOKEN = "CLIENT_ACCESS_TOKEN"
    LANGUAGE = "en"
    DB_NAME = 'heroku_swknz2mg'
    TABLE_NAME = 'userinfo'

    def __init__(self):
        self.session = self.create_sessionid()
        self.apicat = self.get_envariable(
            ConnectionEstablisher.APICLIENT_ACCESS_TOKEN)
        self.mongoaccess = self.get_envariable(
            ConnectionEstablisher.MONGODB_ACCESS)
        self.apisecure = ApiConnector(self.session, self.LANGUAGE, self.apicat)
        self.mongoclient = MongoConnector(self.mongoaccess)

    # Creates a unique session id for the connection with the API.AI server;
    # API.AI limits session ids to 36 characters.
    def create_sessionid(self):
        return str(uuid.uuid4())[:36]

    # Gets the named environment variable from the local OS
    def get_envariable(self, vname):
        return os.getenv(vname)

    # Sends the text to API.AI, reads the HTTP response object back,
    # decodes it as JSON, and returns the parsed result.
    def api_connect(self, text):
        self.apisecure = ApiConnector(
            session_id=self.session,
            lang=self.LANGUAGE,
            cat=self.apicat)  # cat - Client Access Token
        response = self.apisecure.send_textquery(text).read()
        response_json = json.loads(response.decode('utf-8'))
        return response_json

    # DB helpers (record lookup, insert, update) that delegate to the
    # MongoConnector's functions
    def dbrecord_exists(self, **fields):
        return self.mongoclient.record_exists(
            DB=ConnectionEstablisher.DB_NAME,
            TABLE=ConnectionEstablisher.TABLE_NAME,
            **fields)

    def dbrecord_insert(self, **fields):
        self.mongoclient.insert(ConnectionEstablisher.DB_NAME,
                                ConnectionEstablisher.TABLE_NAME, **fields)

    def dbrecord_update(self, user_id, **fields):
        self.mongoclient.update(DB=ConnectionEstablisher.DB_NAME,
                                TABLE=ConnectionEstablisher.TABLE_NAME,
                                user_id=user_id, **fields)
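# --- Usage sketch (not part of the source) ---
# A minimal example of how ConnectionEstablisher might be driven, assuming
# the CLIENT_ACCESS_TOKEN and DB_ACCESS environment variables are set and
# ApiConnector/MongoConnector are importable. The query text and the field
# names below are hypothetical.
ce = ConnectionEstablisher()

# Send a text query to API.AI and inspect the parsed JSON reply.
reply = ce.api_connect("What is the weather today?")
print(reply.get("result", {}).get("fulfillment"))

# Insert a user record if it is new, otherwise update it.
if not ce.dbrecord_exists(user_id="12345"):
    ce.dbrecord_insert(user_id="12345", name="Alice")
else:
    ce.dbrecord_update("12345", name="Alice")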
def three_line_break_analysis_sample():
    """ three line break technical analysis sample """
    # Three Line Break parameters
    # Currency code to analyse
    currency = 'USD'
    # line-break control count (number of previous lines checked for a reversal)
    line_break_control_count = 3
    # start and end dates of the analysis period
    start_day = datetime.datetime(2015, 6, 1)
    end_day = datetime.datetime(2016, 1, 23)

    # Reading the rates to be analysed with the given parameters
    mc = MongoConnector()
    rates = mc.mongo_get_rates(currency, start_day, end_day,
                               line_break_control_count)

    # creating a TLB instance and analysing the data
    tlb = ThreeLineBreakAnalysis(rates, line_break_control_count, start_day)
    result = tlb.analyse()

    # creating the chart of the result
    fig = seaborn.plt.figure(figsize=(14, 5))
    locs, labels = seaborn.plt.xticks()
    fig.axes[0].set_title('Three Line Break (' +
                          str(line_break_control_count) + ') EUR/' +
                          currency + ' ' + start_day.strftime('%d/%m/%Y') +
                          ' - ' + end_day.strftime('%d/%m/%Y'))
    fig.axes[0].set(xticklabels=[])
    a = 0
    for rs in result:
        # invisible line that stretches the axes to fit each box
        fig.axes[0].plot([a, a], [rs.min_price, rs.max_price], linewidth=0)
        if rs.color == 'G':  # increases are green
            fig.axes[0].add_patch(Rectangle((a, rs.min_price), 1,
                                            rs.max_price - rs.min_price,
                                            fill=False, edgecolor='green',
                                            lw=1))
        else:  # decreases are red
            fig.axes[0].add_patch(Rectangle((a, rs.min_price), 1,
                                            rs.max_price - rs.min_price,
                                            fill=False, edgecolor='red',
                                            lw=1))
        a += 1

    # saving the chart as a PNG image file
    fig.savefig('TLB_Result.png', dpi=400, bbox_inches='tight')
    print("Created chart PNG file.")
def get_live_rates():
    """
    Gets the live rates that the European Central Bank serves.
    Stores the rates in MongoDB.
    """
    file_name = 'eurofxref-hist.xml'
    file_url = 'https://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist.xml'
    # downloading the XML file
    download_file(file_url, file_name)
    # parsing the XML file to get the rates
    rate_data = read_file(file_name)
    mc = MongoConnector()
    # removing the previous rates
    mc.mongo_delete_rates()
    # inserting the new rates into the db
    mc.mongo_insert_rates(rate_data)
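# --- Helper sketch (not part of the source) ---
# download_file and read_file are called above but not defined in this
# section. A minimal standard-library sketch of download_file, with its
# signature inferred from the call site; the real helper may stream,
# retry, or validate the response.
import urllib.request


def download_file(file_url, file_name):
    # fetch the URL and write the raw bytes to a local file
    with urllib.request.urlopen(file_url) as response:
        with open(file_name, 'wb') as out_file:
            out_file.write(response.read())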
def rsi_analysis_sample():
    """ RSI technical analysis sample """
    # RSI parameters
    # Currency code to analyse
    currency = 'USD'
    # number of days for the RSI calculation
    number_of_days = 14
    # start and end dates of the analysis period
    start_day = datetime.datetime(2015, 1, 1)
    end_day = datetime.datetime(2016, 1, 23)
    # overbought and oversold thresholds
    over_bought = 70
    over_sold = 30

    # Reading the rates to be analysed with the given parameters
    mc = MongoConnector()
    rates = mc.mongo_get_rates(currency, start_day, end_day, number_of_days)

    # creating an RSI instance and analysing the data
    rsi = RSIAnalysis(number_of_days, rates, start_day)
    result = rsi.analyse()

    # creating a chart with two subplots:
    # one for the RSI values, the other for the real end-of-day prices
    fig, axes = seaborn.plt.subplots(2, 1)
    fig.set_size_inches(14, 5)
    locs, labels = seaborn.plt.xticks()
    seaborn.plt.setp(labels, rotation=45)
    axes[0].plot([x.day for x in result], [x.value for x in result])
    axes[0].set_title('RSI(' + str(number_of_days) + ') Overbought: ' +
                      str(over_bought) + ' Oversold: ' + str(over_sold))
    axes[0].set(xticklabels=[])
    axes[0].set(ylim=(0, 100))
    # horizontal reference lines at the overbought/oversold levels
    axes[0].plot([x.day for x in result], [over_bought] * len(result))
    axes[0].plot([x.day for x in result], [over_sold] * len(result))
    axes[1].plot([x.day for x in result], [x.price for x in result])
    axes[1].set_title('Real Rates (EUR/' + currency + ')')

    # saving the chart as a PNG image file
    fig.savefig('RSI_Result.png', dpi=400, bbox_inches='tight')
    print("Created chart PNG file.")
def bollinger_bands_analysis_sample():
    """ Bollinger Bands technical analysis sample """
    # Bollinger Bands parameters
    # Currency code to analyse
    currency = 'USD'
    # number of days for the calculation
    number_of_days = 20
    # start and end dates of the analysis period
    start_day = datetime.datetime(2015, 1, 1)
    end_day = datetime.datetime(2016, 1, 23)
    # number of standard deviations for the bands
    count_of_std = 2

    # Reading the rates to be analysed with the given parameters
    mc = MongoConnector()
    rates = mc.mongo_get_rates(currency, start_day, end_day, number_of_days)

    # creating a Bollinger Bands instance and analysing the data
    bb = BollingerBandsAnalysis(number_of_days, count_of_std, rates, start_day)
    result = bb.analyse()

    # creating the chart
    fig = seaborn.plt.figure(figsize=(18, 5))
    locs, labels = seaborn.plt.xticks()
    fig.axes[0].plot([x.day for x in result],
                     [x.real_price for x in result],
                     label='Real Prices', color='black', alpha=1, lw=1)
    fig.axes[0].plot([x.day for x in result],
                     [x.upper_band for x in result],
                     label='Upper Band', color='orange', alpha=0.5, lw=2)
    fig.axes[0].plot([x.day for x in result],
                     [x.middle_band for x in result],
                     label='Middle Band', color='y', alpha=0.5, lw=2)
    fig.axes[0].plot([x.day for x in result],
                     [x.lower_band for x in result],
                     label='Lower Band', color='red', alpha=0.5, lw=2)
    fig.axes[0].set_title('Bollinger Bands(' + str(number_of_days) + ', ' +
                          str(count_of_std) + ') EUR/' + currency + ' ' +
                          start_day.strftime('%d/%m/%Y') + ' - ' +
                          end_day.strftime('%d/%m/%Y'))
    seaborn.plt.legend(loc='upper right')
    seaborn.plt.setp(labels, rotation=45)

    # saving the chart as a PNG image file
    fig.savefig('BollingerBands_Result.png', dpi=400, bbox_inches='tight')
    print("Created chart PNG file.")
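# --- Driver sketch (not part of the source) ---
# The three sample functions above share one shape: fetch rates from
# MongoDB, run a single analysis class, render a chart. A hypothetical
# driver with the imports the samples rely on; note the samples use
# seaborn.plt, an alias for matplotlib.pyplot that only older seaborn
# versions expose, and the analysis-class imports are omitted because
# their module paths are not shown in this section.
import datetime

import seaborn
from matplotlib.patches import Rectangle

if __name__ == '__main__':
    get_live_rates()                    # refresh the stored ECB rates
    rsi_analysis_sample()               # RSI(14) chart
    bollinger_bands_analysis_sample()   # Bollinger Bands(20, 2) chart
    three_line_break_analysis_sample()  # TLB(3) chart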
def test_connection(self):
    from MongoConnector import MongoConnector
    with MongoConnector(log_file="test.log") as mc:
        articles = mc.client[mc.secrets['MONGODB_NEWS_DB']].articles
        print(
            articles.find_one({
                "title": "Rags-to-riches story of Nathan's Famous Hot Dogs"
            }))
    os.remove("test.log")
class Get_Retweets(object):
    def __init__(self, mongo_config=MONGO_CONFIG):
        logging.debug("Establishing MongoDB connection with parameters: "
                      "{}".format(mongo_config))
        # despite the name, __connect__ returns the collection handle
        # (it supports .find() below)
        self.cursor = MongoConnector(mongo_config).__connect__()
        logging.info("Established connection...\n")

    def get_retweets(self):
        '''Finds the retweets among the collected tweets and stores them
        in JSON format, one tweet per line.
        '''
        tweets_read = 0
        logging.debug("Checking for retweets...")
        with open(OUTPUT_DIRECTORY + F_NAME, 'w') as f_out:
            total_tweets = self.cursor.find().count()
            logging.debug("Total number of tweets (historical tweets): {0}"
                          .format(total_tweets))
            for tweet in self.cursor.find():
                if tweet.get('retweeted_status'):
                    json.dump(tweet, f_out, default=json_util.default)
                    f_out.write('\n')
                    tweets_read += 1
        logging.debug("Total number of retweets (historical tweets): {0}"
                      .format(tweets_read))
        logging.debug("Total number of non-retweets (historical tweets): {0}"
                      .format(total_tweets - tweets_read))
        logging.debug("Successfully checked for retweets")
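# --- Usage sketch (not part of the source) ---
# Get_Retweets relies on the module-level constants MONGO_CONFIG,
# OUTPUT_DIRECTORY, and F_NAME; the values below are placeholders
# (MONGO_CONFIG mirrors the config1 dict used elsewhere in this section).
MONGO_CONFIG = {
    'MONGO_COLL': 'social_coll',
    'MONGO_DB': 'tweetCorpus',
    'MONGO_HOST': 'localhost',
    'MONGO_PORT': 27017
}
OUTPUT_DIRECTORY = './output/'
F_NAME = 'retweets.json'

# Connect once, then dump every retweet to OUTPUT_DIRECTORY + F_NAME.
Get_Retweets().get_retweets()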
# This snippet opens mid-file; known third-party imports are restored
# here, while the custom PyContract and MongoConnector imports are not
# shown in the source.
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# (the opening of this config dict is truncated in the source)
config = {
    'MONGO_DB': 'tweetCorpus',
    'MONGO_HOST': 'localhost',
    'MONGO_PORT': 27017
}

customWords = [
    'bc', 'http', 'https', 'co', 'com', 'rt', 'one', 'us', 'new', 'lol',
    'may', 'get', 'want', 'like', 'love', 'no', 'thank', 'would', 'thanks',
    'good', 'much', 'low', 'roger', 'im'
]
alphabets = list(map(chr, range(97, 123)))  # 'a' through 'z'
myStopWords = set(
    stopwords.words('english') + list(punctuation) + customWords + alphabets)

# Initialize db connector, contracter, tokenizer, and lemmatizer
dbconnector = MongoConnector(config)
contracter = PyContract()
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
lemmatizer = WordNetLemmatizer()

log_file = OUTPUT_DIRECTORY + "/pre_processing_log.log"
# Start logging
logging.basicConfig(filename=log_file, level=logging.DEBUG,
                    format='%(asctime)s %(message)s')


def get_Tweets(user: str) -> dict:
    '''
    This function takes the config file and connects to the MongoDB
    collection. Retrieves the tweet list for the user id and returns a
    dict object.
    '''
import time
import logging
import sys
import json
import codecs
import timeit
import os

# import custom libraries
from MongoConnector import MongoConnector

config1 = {
    'MONGO_COLL': 'social_coll',
    'MONGO_DB': 'tweetCorpus',
    'MONGO_HOST': 'localhost',
    'MONGO_PORT': 27017
}

cursor2 = MongoConnector(config1).__connect__()
coll = config1['MONGO_COLL']

########## TWITTER API ACCESS KEYS AND TOKENS #############
ACCESS_TOKEN_fol = "****"
ACCESS_TOKEN_SECRET_fol = "****"
CONSUMER_KEY_fol = "****"
CONSUMER_SECRET_fol = "****"

ACCESS_TOKEN_fr = "****"
ACCESS_TOKEN_SECRET_fr = "****"
CONSUMER_KEY_fr = "****"
CONSUMER_SECRET_fr = "****"
#############################################################
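# --- Client sketch (not part of the source) ---
# The masked keys above are presumably fed into a Twitter client; a
# minimal tweepy (v3.x) wiring of the "_fol" credential set. That tweepy
# is the client in use is an assumption, since this section does not show
# the client setup.
import tweepy

auth = tweepy.OAuthHandler(CONSUMER_KEY_fol, CONSUMER_SECRET_fol)
auth.set_access_token(ACCESS_TOKEN_fol, ACCESS_TOKEN_SECRET_fol)
api = tweepy.API(auth, wait_on_rate_limit=True)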
#### DB RELATED STUFF ###
RESUME = False
if options.resume:
    optionsrec["RESUME"] = True

databaserec = None
if options.mongodb is not None:
    optionsrec["DBDRIVE"] = True
    mongoData = json.loads(open(options.mongodb).read())
    client = MongoClient(mongoData["host"], mongoData["port"])
    db = client[mongoData["db"]]
    databaserec = MongoConnector({
        "classes": db[mongoData["classes"]],
        "classes_old": db[mongoData["classes_old"]],
        "instances": db[mongoData["instances"]],
        "instances_old": db[mongoData["instances_old"]]
    })
    logging.info("Read DB configuration from " + options.mongodb)

mandatory = []
if options.mandatoryfile is not None:
    try:
        mandatory = json.loads(open(options.mandatoryfile).read())
    except Exception:
        logging.warning(
            "Failed to read file with mandatory field information.")
def test___init__(self):
    from MongoConnector import MongoConnector
    mc = MongoConnector(log_file="test.log")
    os.remove("test.log")
def test___enter__(self):
    from MongoConnector import MongoConnector
    with MongoConnector(log_file="test.log") as mc:
        pass
    os.remove("test.log")
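# --- Inferred interface sketch (not part of the source) ---
# The tests above exercise construction, context entry/exit, and a query
# through mc.client and mc.secrets. A sketch of the MongoConnector surface
# they imply; everything beyond that tested surface (the placeholder
# secrets, host, and port) is an assumption, not the project's actual
# implementation.
import logging

from pymongo import MongoClient


class MongoConnector:
    def __init__(self, log_file="mongo.log"):
        logging.basicConfig(filename=log_file, level=logging.INFO)
        # placeholder secrets; the real class presumably loads these
        self.secrets = {"MONGODB_NEWS_DB": "news"}
        self.news = self.secrets["MONGODB_NEWS_DB"]
        self.client = MongoClient("localhost", 27017)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.client.close()
        return False  # do not suppress exceptions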
def __init__(self,
             num_distinct_documents=5000,
             replace_entities=True,
             max_term_length=127,
             remove_stopwords=True,
             custom_stopwords=[
                 ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':',
                 '?', 'I', '(', ')'
             ],
             analyze=False,
             document_table_name="documents",
             sentence_table_name="sentences",
             sentence_fields=OrderedDict({
                 "doc_id": "document_id",
                 "sen_id": "sentence_id",
                 "content": "sentence_text"
             }),
             term_table_name="terms",
             term_sql_format=("term_id", "term_text", "is_entity"),
             term_occurrence_table_name="term_occurrence",
             term_occurrence_sql_format=("document_id", "sentence_id",
                                         "term_id"),
             entity_table_name="entities",
             entity_sql_format=("entity_id", "entity_type"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/TermGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes various parameters, registers logger and MongoConnector,
    and sets up the document limit.

    :param num_distinct_documents: (int) The number of distinct documents
           retrieved from the queries. For performance reasons, this should
           be limited during debugging/development. 0 (zero) represents no
           limit, in accordance with the MongoDB standard for .limit().
    :param replace_entities: (boolean) Whether or not entities in the text
           should be recognised and replaced. The reason for this is that
           single terms might be merged into one term, i.e. first and last
           name: "Dennis" "Aumiller" would be two separate terms with
           traditional splitting (replace_entities=False), whereas - if set
           to True - "Dennis Aumiller" would represent a single entity.
    :param max_term_length: (int) Maximum length of a term (varchar
           property in the table).
    :param remove_stopwords: (boolean) Determines whether stop words are
           removed. Currently, we are still deciding on the final set, but
           likely either one (or both) of the NLTK and spaCy stop word
           lists.
    :param custom_stopwords: (list of strings) Additional words that will
           not be considered at adding-time.
    :param analyze: (boolean) Whether or not to include analytically
           relevant metrics.
    :param document_table_name: (str) Name of the table where the document
           information is stored.
    :param sentence_table_name: (str) Name of the table where the sentence
           information will be stored.
    :param sentence_fields: (OrderedDict) Mapping of input fields in
           MongoDB to output fields in Postgres for the sentence table.
    :param term_table_name: (str) Name of the Postgres table for the terms.
    :param term_sql_format: (tuple) Since terms are generated locally, a
           tuple of the Postgres columns suffices.
    :param term_occurrence_table_name: (str) Name of the Postgres table
           for the term occurrences.
    :param term_occurrence_sql_format: (tuple) Same as term_sql_format,
           but for the term occurrences.
    :param entity_table_name: (str) (Not implemented yet) Name of the
           table for the entity meta information.
    :param entity_sql_format: (tuple) Same as term_sql_format, but for
           entities.
    :param database: (str) Database name.
    :param user: (str) User name for access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) of the Postgres host.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to
           stdout as well.
    """
    # set up logger
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info("Successfully registered logger to TermGenerator.")

    # register a MongoConnector
    self.mc = MongoConnector()
    self.logger.info(
        "Successfully registered MongoConnector to TermGenerator.")

    # PostgresConnector
    self.pc = PostgresConnector(database, user, password, host, port)
    self.logger.info(
        "Successfully registered PostgresConnector to TermGenerator.")

    self.num_distinct_documents = num_distinct_documents
    # do this early since we need it already for the distinct documents.
    self.document_table_name = document_table_name

    # get the distinct IDs of the documents so we can match against them
    # later. Since we have removed parts of the document collection, we
    # have to make sure to get this from Postgres.
    self.logger.info("Parsing relevant documents from Postgres...")
    with self.pc as open_pc:
        open_pc.cursor.execute("SELECT document_id FROM {}".format(
            self.document_table_name))
        self.first_distinct_documents = list(open_pc.cursor.fetchall())
        # extract from the tuple structure
        self.first_distinct_documents = [
            el[0] for el in self.first_distinct_documents
        ]
        self.logger.info("Retrieved all relevant documents from Postgres.")

    # additionally restrict if we only want a limited number of documents.
    if self.num_distinct_documents != 0:
        self.logger.info(
            "Non-zero limit detected. Limiting to the first N entries.")
        self.first_distinct_documents = \
            self.first_distinct_documents[:self.num_distinct_documents]

    self.replace_entities = replace_entities
    self.analyze = analyze
    self.max_term_length = max_term_length
    self.nlp = spacy.load("en")

    # construct a dictionary with the entries per document/sentence id
    # pair. Thus, we can later check with higher efficiency whether there
    # are any entities in the current sentence.
    self.occurrence_dict = {}
    self.occurring_entities = []

    # start building the term dictionary/set, as well as an occurrence
    # map. Since terms will be "post-processed", the collection is first
    # created as a list and later cast to Counter and set.
    self.terms = []  # cast into a set later on.
    self.term_in_sentence = set()
    self.term_id = {}
    self.term_is_entity = {}
    if self.analyze:
        self.term_count = Counter()
        self.entity_count = Counter()
    self.entities = []
    self.sentences = []
    self.processed_sentences = []

    # Postgres tables
    if not sentence_fields:
        self.logger.error("No sentence fields specified!")
    self.sentence_table_name = sentence_table_name
    self.sentence_fields = sentence_fields

    if not term_sql_format:
        self.logger.error("No term fields specified!")
    self.term_table_name = term_table_name
    self.term_sql_format = ", ".join(term_sql_format)

    if not term_occurrence_sql_format:
        self.logger.error("No term occurrence fields specified!")
    self.term_occurrence_table_name = term_occurrence_table_name
    self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)

    if not entity_sql_format:
        self.logger.error("No entity fields specified!")
    self.entity_table_name = entity_table_name
    self.entity_sql_format = ", ".join(entity_sql_format)

    # fields to retrieve from MongoDB:
    self.sentence_values_to_retrieve = {
        key: 1 for key in self.sentence_fields.keys()
    }
    # suppress _id if not explicitly requested:
    if "_id" not in self.sentence_values_to_retrieve.keys():
        self.sentence_values_to_retrieve["_id"] = 0
    self.sentence_sql_format = ", ".join(
        [value for value in self.sentence_fields.values()])

    # create the union of stop word sets, and add any custom stop words
    self.remove_stopwords = remove_stopwords
    self.removed_counter = 0
    self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
    for word in custom_stopwords:
        self.stopwords.add(word)

    self.logger.info("Successfully initialized TermGenerator.")
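# --- Usage sketch (not part of the source) ---
# Only __init__ is shown above, so this is just a hedged instantiation:
# a small document limit for development, with placeholder credentials.
# Whatever method consumes self.terms etc. afterwards is not shown here.
tg = TermGenerator(num_distinct_documents=100,
                   replace_entities=True,
                   user="postgres",
                   password="postgres")
# Afterwards, tg.first_distinct_documents holds the limited document IDs
# and tg.stopwords the merged spaCy/NLTK/custom stop word set.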
import json
import sys
from pprint import pprint

import pymongo

# import custom libraries
from MongoConnector import MongoConnector

# Load the config dictionary object from the db_config.json file
with open("db_config.json", 'r') as f:
    config = json.load(f)

# print the config file
pprint(config)

# Create a cursor to connect to MongoDB
cursor = MongoConnector(config).__connect__()

# Get a document from the MongoDB collection
# documents = cursor.find({}).limit(1)
# pprint(list(documents))

# Load the unique users from the file into unique_users_list
unique_users_list = []
with open("./output/unique_users.txt", 'r', encoding='utf-8') as outfile:
    unique_users_list = outfile.read().splitlines()

# Collect the tweets of the unique users and save them in a dictionary.
# Store the user info (id_str, screen_name), tweet info (tweet_id, tweet,
# tweet_status, truncated), user mention info (mentions, id_str, names,
# screen-names), favorite info (favorited, favorite_count), retweet info
# (retweeted, retweet_count), reply info (reply_id, reply_count), and
# quote info (quote_status, quote_list)
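# --- Collection loop sketch (not part of the source) ---
# The comment above describes the per-user collection step without
# showing it. A minimal version of that loop; the query path and the
# fields kept follow Twitter's standard tweet JSON and are assumptions.
user_tweets = {}
for screen_name in unique_users_list:
    docs = cursor.find({"user.screen_name": screen_name})
    user_tweets[screen_name] = [{
        "id_str": d.get("id_str"),
        "text": d.get("text"),
        "truncated": d.get("truncated"),
        "favorite_count": d.get("favorite_count"),
        "retweet_count": d.get("retweet_count"),
    } for d in docs]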
def __init__(self,
             fields=OrderedDict({
                 "_id": "document_id",
                 "title": "title",
                 "feedName": "feedName",
                 "category": "category",
                 "feedURL": "feedURL",
                 "published": "published"
             }),
             num_distinct_documents=0,
             document_table_name="documents",
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/DocumentGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes context, and sets up the documents that will be parsed.
    Also establishes the PostgresConnector that will later be used to push
    the retrieved documents.

    :param fields: (OrderedDict) Key-value pairs that map each field to
           retrieve (key) to the name it should have in the SQL table
           (value). Ordered because SQL tables are.
    :param num_distinct_documents: (int) As the name indicates, the number
           of distinct articles that should be used, mainly for debugging
           purposes. 0 means all documents will be used, in accordance
           with MongoDB standards.
    :param document_table_name: (str) Name of the Postgres table that
           should contain the documents.
    :param database: (str) Database name.
    :param user: (str) User name for access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) of the Postgres host.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to
           stdout as well.
    """
    # set up logger
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info(
        "Successfully registered logger to DocumentGenerator.")

    # register a MongoConnector
    self.mc = MongoConnector()
    self.logger.info(
        "Successfully registered MongoConnector to DocumentGenerator.")

    self.num_distinct_documents = num_distinct_documents
    # get the distinct IDs of the documents so we can match against them later
    if self.num_distinct_documents != 0:
        self.logger.info(
            "Non-zero limit detected. Fetching first N distinct document "
            "IDs now...")
        with self.mc as open_mc:
            documents = open_mc.client[open_mc.news].articles
            self.first_documents = list(documents.find().limit(
                self.num_distinct_documents))
            # for a small enough number and a large enough document
            # collection, this is more efficient:
            self.first_documents = [
                el["_id"] for el in self.first_documents
            ]
            self.logger.info(
                "Successfully registered relevant document IDs.")
    else:
        # needed to avoid later conflicts
        self.first_documents = []

    # set up the PostgresConnector. Since we only use these once, I don't
    # see any reason to store the connection details locally again.
    self.pc = PostgresConnector(database, user, password, host, port)
    self.logger.info(
        "Successfully registered PostgresConnector to DocumentGenerator.")

    # format the fields into a reasonable format
    self.fields = fields
    if not self.fields:
        self.logger.error("No fields for MongoDB table specified!")
    self.values_to_retrieve = {key: 1 for key in self.fields.keys()}
    # suppress _id if not wanted, as it is returned by default.
    if "_id" not in self.values_to_retrieve.keys():
        self.values_to_retrieve["_id"] = 0  # TODO
    self.sql_format = ", ".join([value for value in self.fields.values()])
    self.document_table_name = document_table_name

    # preparation for later, according to PEP 8
    self.data = []
    self.logger.info("Successfully set up DocumentGenerator.")
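# --- Usage sketch (not part of the source) ---
# A hedged instantiation of DocumentGenerator with a small debug limit;
# the derived attributes computed in __init__ can then be inspected.
# Connection details fall back to the (masked) defaults above.
dg = DocumentGenerator(num_distinct_documents=50)
print(dg.values_to_retrieve)  # e.g. {'_id': 1, 'title': 1, ...}
print(dg.sql_format)          # "document_id, title, feedName, ..."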