def __init__(self, debug=False, verbose=False, slackbot_injected=None, slacker_injected=None):
    self.debug = debug
    self.verbose = verbose
    self.config = config.Config()

    slackbot_token = os.getenv(self.config.slackbot_api_token_env_varname)
    api_token = os.getenv(self.config.api_token_env_varname)

    self.slackbot = slackbot_injected or slackbot.Slackbot(config.SLACK_NAME, token=slackbot_token)

    self.logger = logging.getLogger(__name__)
    utils.set_up_logger(self.logger,
                        log_level_env_var='DESTALINATOR_LOG_LEVEL',
                        log_to_slack_env_var='DESTALINATOR_LOG_TO_CHANNEL',
                        log_channel=self.config.log_channel,
                        slackbot=self.slackbot)

    self.destalinator_activated = False
    if os.getenv(self.config.destalinator_activated_env_varname):
        self.destalinator_activated = True
    self.logger.debug("destalinator_activated is %s", self.destalinator_activated)

    self.slacker = slacker_injected or slacker.Slacker(config.SLACK_NAME, token=api_token, logger=self.logger)

    self.ds = destalinator.Destalinator(slacker=self.slacker,
                                        slackbot=self.slackbot,
                                        activated=self.destalinator_activated,
                                        logger=self.logger)
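# A minimal sketch of how the *_injected parameters above let tests bypass real Slack
# connections. "Executor" is a hypothetical name for the class owning this __init__;
# the mocks stand in for slackbot.Slackbot and slacker.Slacker.
from unittest import mock

fake_slackbot = mock.MagicMock(name="slackbot")
fake_slacker = mock.MagicMock(name="slacker")

runner = Executor(debug=True,
                  slackbot_injected=fake_slackbot,
                  slacker_injected=fake_slacker)
assert runner.slackbot is fake_slackbot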
def __init__(self, logging_flag=''):
    app_name = __file__.split('.')[0]
    utils.set_up_logger(app_name, logging_flag)

    # Misc. class attributes
    self.cursor = None

    # Counters
    self.count_discarded_urls = 0
    self.count_crawled_pages = 0
    self.count_downloaded_pages = 0
    self.count_no_links_found_pages = 0
    self.total_links_added = 0

    # Timers
    self.crawl_time = 0
    self.now = ''
def __init__(self, prefix="entity", window_size=2, entities_only=True, port=5436, log_file=os.path.join(os.path.dirname(__file__), "logs/SchemaCreator.log"), log_level=logging.INFO, log_verbose=True ): """ Set up. :param prefix: (str) Prefix to the table names. :param port: (int) Used to connect to the Postgres tables. :param log_file: (os.path) Path to the file containing the logs. :param log_level: (logging.LEVEL) Specifies the level to be logged. :param log_verbose: (boolean) Specifies whether or not to look to stdout as well. """ self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.window_size = window_size self.prefix = prefix + "_" + str(self.window_size) self.entities_only = entities_only self.names = self.get_names(self.prefix) self.port = port self.pc = PostgresConnector(port=port) self.logger.info("Successfully registered SchemaGenerator.")
def __init__(self,
             secrets=os.path.join(os.path.dirname(__file__), "secrets.json"),
             log_file=os.path.join(os.path.dirname(__file__), "logs/MongoConnector.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    :param secrets: (os.path) Path to the file containing the relevant parameters for the login via an
           SSH tunnel into MongoDB. Will be treated as a JSON object (i.e. a dictionary in Python),
           and must contain the following parameters:
               SSH_HOST, specifies the tunnel address
               LDAP_USER, user name for the ssh login
               LDAP_PASSWORD, corresponding password
               MONGODB_PORT, usually 27016 (careful, this has to be an integer)
               MONGODB_HOST, host address of the mongodb, usually localhost
               MONGODB_AUTH_DB, database which is owned by the user
               MONGODB_AUTH_USER, username for mongodb
               MONGODB_AUTH_PW, corresponding password
               MONGODB_NEWS_DB, database which contains the relevant news articles.
           The default location is assumed to be the same directory as the class definition.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
    """
    # read secrets
    with open(secrets, "r") as secret_file:
        self.secrets = json.load(secret_file)

    # set up log file. Specify name and level, and also print message time to it.
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info("Successfully registered logger to MongoConnector.")

    # name abstraction, since it is frequently used
    self.news = self.secrets['MONGODB_NEWS_DB']
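# A minimal sketch of the secrets.json file the constructor above expects. The key names come
# straight from the docstring; every value here is a placeholder.
import json

example_secrets = {
    "SSH_HOST": "tunnel.example.org",   # tunnel address
    "LDAP_USER": "jdoe",                # ssh login
    "LDAP_PASSWORD": "********",
    "MONGODB_PORT": 27016,              # must be an integer
    "MONGODB_HOST": "localhost",
    "MONGODB_AUTH_DB": "admin",
    "MONGODB_AUTH_USER": "news_reader",
    "MONGODB_AUTH_PW": "********",
    "MONGODB_NEWS_DB": "news",
}

with open("secrets.json", "w") as f:
    json.dump(example_secrets, f, indent=4)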
def test_set_up_logger(self):
    from utils import set_up_logger
    from logging import Logger

    logger = set_up_logger("test", "test.log")
    self.assertIsInstance(logger, Logger)
    os.remove("test.log")
def __init__(self, logging_flag):
    app_name = __file__.split('.')[0]
    utils.set_up_logger(app_name, logging_flag)

    # Misc. class attributes
    self.soup = None
    self.hashed_soup = None
    self.compressed_soup = None
    self.cursor = None

    # Counters
    self.urlerrors = 0
    self.count_prospective_pages = 0
    self.count_saved = 0
    self.count_discarded_pages = 0

    # Timers
    self.start_time = time.time()
    self.request_time = 0
    self.disk_save_time = 0
def __init__(self,
             num_distinct_documents=5000,
             replace_entities=True,
             max_term_length=127,
             remove_stopwords=True,
             custom_stopwords=[',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?', 'I', '(', ')'],
             analyze=False,
             document_table_name="documents",
             sentence_table_name="sentences",
             sentence_fields=OrderedDict({
                 "doc_id": "document_id",
                 "sen_id": "sentence_id",
                 "content": "sentence_text"
             }),
             term_table_name="terms",
             term_sql_format=("term_id", "term_text", "is_entity"),
             term_occurrence_table_name="term_occurrence",
             term_occurrence_sql_format=("document_id", "sentence_id", "term_id"),
             entity_table_name="entities",
             entity_sql_format=("entity_id", "entity_type"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__), "logs/TermGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes various parameters, registers logger and MongoConnector, and sets up the limit.
    :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
           For performance reasons, this should be limited during debugging/development.
           0 (zero) represents no limit, in accordance with the MongoDB standard for .limit().
    :param replace_entities: (boolean) Whether or not entities in the text should be recognised and
           replaced. The reason for this is that single terms might be merged into one term,
           i.e. first and last name: "Dennis" "Aumiller" would be two separate terms with the
           traditional splitting (replace_entities=False), whereas - if set to True -
           "Dennis Aumiller" would represent only one entity.
    :param max_term_length: (int) Indicator of how long the terms are supposed to be
           (varchar property in the table).
    :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently,
           we are still deciding on the final set, but likely either one (or both) of NLTK's and
           SpaCy's stop word lists.
    :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
    :param analyze: (boolean) Whether or not to include analytically relevant metrics.
    :param document_table_name: (str) Name of the table where the document information is stored.
    :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
    :param sentence_fields: (OrderedDict) Mapping of input fields from MongoDB to output columns in
           Postgres for the sentence table.
    :param term_table_name: (str) Name of the Postgres table for the terms.
    :param term_sql_format: (tuple) Since those are generated locally, a tuple of the Postgres columns suffices.
    :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences.
    :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
    :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
    :param entity_sql_format: (tuple) Same as term_sql_format, but for entities.
    :param database: (str) Database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the Postgres database.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
    """
    # set up logger
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info("Successfully registered logger to TermGenerator.")

    # register a MongoConnector
    self.mc = MongoConnector()
    self.logger.info("Successfully registered MongoConnector to TermGenerator.")

    # PostgresConnector
    self.pc = PostgresConnector(database, user, password, host, port)
    self.logger.info("Successfully registered PostgresConnector to TermGenerator.")

    self.num_distinct_documents = num_distinct_documents
    # do this earlier since we need it already for the distinct documents.
    self.document_table_name = document_table_name

    # get the distinct IDs for the documents so we can match against them later.
    # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
    self.logger.info("Parsing relevant documents from Postgres...")
    with self.pc as open_pc:
        open_pc.cursor.execute("SELECT document_id FROM {}".format(self.document_table_name))
        self.first_distinct_documents = list(open_pc.cursor.fetchall())
        # extract from the tuple structure
        self.first_distinct_documents = [el[0] for el in self.first_distinct_documents]
        self.logger.info("Retrieved all relevant documents from Postgres.")

    # additionally restrict if we only want a limited number of documents.
    if self.num_distinct_documents != 0:
        self.logger.info("Non-zero limit detected. Limiting to the first N entries.")
        self.first_distinct_documents = self.first_distinct_documents[:self.num_distinct_documents]

    self.replace_entities = replace_entities
    self.analyze = analyze
    self.max_term_length = max_term_length

    self.nlp = spacy.load("en")

    # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
    # there are any entities in the current sentence with higher efficiency.
    self.occurrence_dict = {}
    self.occurring_entities = []

    # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
    # it is first created as a list and later cast to Counter and set.
    self.terms = []  # cast into a set later on.
    self.term_in_sentence = set()
    self.term_id = {}
    self.term_is_entity = {}
    if self.analyze:
        self.term_count = Counter()
        self.entity_count = Counter()
    self.entities = []
    self.sentences = []
    self.processed_sentences = []

    # Postgres tables
    if not sentence_fields:
        self.logger.error("No sentence fields specified!")
    self.sentence_table_name = sentence_table_name
    self.sentence_fields = sentence_fields

    if not term_sql_format:
        self.logger.error("No term fields specified!")
    self.term_table_name = term_table_name
    self.term_sql_format = ", ".join(term_sql_format)

    if not term_occurrence_sql_format:
        self.logger.error("No term occurrence fields specified!")
    self.term_occurrence_table_name = term_occurrence_table_name
    self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)

    if not entity_sql_format:
        self.logger.error("No entity fields specified!")
    self.entity_table_name = entity_table_name
    self.entity_sql_format = ", ".join(entity_sql_format)

    # value-retrieving parse:
    self.sentence_values_to_retrieve = {key: 1 for key in self.sentence_fields.keys()}
    # suppress _id if not present:
    if "_id" not in self.sentence_values_to_retrieve.keys():
        self.sentence_values_to_retrieve["_id"] = 0
    self.sentence_sql_format = ", ".join([value for value in self.sentence_fields.values()])

    # create union of stop words, and add potentially custom stop words
    self.remove_stopwords = remove_stopwords
    self.removed_counter = 0
    self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
    # add custom stopwords.
    for word in custom_stopwords:
        self.stopwords.add(word)

    self.logger.info("Successfully initialized TermGenerator.")
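# A minimal usage sketch, assuming the __init__ above belongs to a class named TermGenerator
# (the name used in its own log messages) and that MongoDB (via MongoConnector) and Postgres
# (via PostgresConnector) are reachable with the configured credentials.
tg = TermGenerator(num_distinct_documents=100,   # keep the run small while developing
                   replace_entities=True,
                   remove_stopwords=True)
# The constructor only prepares state; the actual term extraction methods live elsewhere in the class.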
def __init__(self,
             window_size=2,
             limit_edges=False,
             entities_only=False,
             document_table_name="documents",
             sentence_table_name="sentences",
             entity_table_name="entities",
             term_table_name="terms",
             term_occurrence_table_name="term_occurrence",
             hyperedge_table_name="hyperedges",
             hyperedge_format=("edge_id", "term_id", "pos"),
             hyperedge_document_table_name="hyperedge_document",
             hyperedge_document_format=("edge_id", "document_id"),
             hyperedge_sentence_table_name="hyperedge_sentences",
             hyperedge_sentence_format=("edge_id", "document_id", "sentence_id", "pos"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__), "logs/HyperedgeGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes the hyperedge generator class.
    :param window_size: (int) Number of sentences in each direction that will determine the context
           window size of the algorithm.
    :param limit_edges: (boolean) Experimental: limits the maximum number of terms per hyperedge.
           This would only be useful in context with other theoretical results.
    :param entities_only: (boolean) Indicates whether or not we should only take into account entity
           terms, rather than the entirety of all term occurrences, for the edges.
    :param document_table_name: (str) Name of the table where documents are stored.
    :param sentence_table_name: (str) Name of the table containing the sentences and their content.
    :param entity_table_name: (str) Name of the table containing the entity information and their properties.
    :param term_table_name: (str) Name of the table containing the terms and meta data.
    :param term_occurrence_table_name: (str) Name of the table containing term occurrence data.
    :param hyperedge_table_name: (str) Name of the table containing the general hyperedge information.
    :param hyperedge_format: (tuple) Table structure of the hyperedge table.
    :param hyperedge_document_table_name: (str) Name of the table containing the document classification.
    :param hyperedge_document_format: (tuple) Table structure of the hyperedge document table.
    :param hyperedge_sentence_table_name: (str) Name of the table containing the hyperedge sentence data.
    :param hyperedge_sentence_format: (tuple) Table structure of the hyperedge sentence table.
    :param database: (str) Database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the Postgres database.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
""" self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.logger.info( "Successfully registered logger to HyperedgeGenerator.") # important for hyperedges self.window_size = window_size self.limit_edges = limit_edges self.entities_only = entities_only # table names self.document_table_name = document_table_name self.sentence_table_name = sentence_table_name self.entity_table_name = entity_table_name self.term_table_name = term_table_name self.term_occurrence_table_name = term_occurrence_table_name self.hyperedge_table_name = hyperedge_table_name self.hyperedge_document_table_name = hyperedge_document_table_name self.hyperedge_sentence_table_name = hyperedge_sentence_table_name self.hyperedge_format = ", ".join([el for el in hyperedge_format]) self.hyperedge_document_format = ", ".join( [el for el in hyperedge_document_format]) self.hyperedge_sentence_format = ",".join( [el for el in hyperedge_sentence_format]) self.pc = PostgresConnector(database, user, password, host, port) self.logger.info( "Successfully registered PostgresConnector to HyperedgeGenerator.") self.hyperedge = [] self.hyperedge_sentence = [] self.hyperedge_document = [] self.all_hyperedges = [] self.all_hyperedge_sentences = [] # set up the "hyper edge ID counter", which is simply consecutive from 1. with self.pc as open_pc: if not check_table_existence(self.logger, open_pc, self.hyperedge_table_name): return 0 self.logger.info("Retrieving current hyper edge ID key...") open_pc.cursor.execute( "SELECT COUNT(DISTINCT h.edge_id) FROM {} as h".format( self.hyperedge_table_name)) # either start with 1 or get the current maximum self.hyperedge_ID = max(1, open_pc.cursor.fetchone()[0])
parser.add_argument('username', help='db user')
parser.add_argument('host', help='db host')
parser.add_argument('db_name', help='db name')
parser.add_argument('password', help='db user password')
parser.add_argument('--version', '-v', action='version', version='%(prog)s 1.0')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--updateVersion')

args = parser.parse_args()
# we can assume the arguments are correct at this point
logger = utils.set_up_logger()


def main():
    DB_URL = 'mysql://{user}:{passwd}@{host}/{db}'.format(host=args.host, user=args.username,
                                                          passwd=args.password, db=args.db_name)
    logger.debug('DB_URL: {url}'.format(url=DB_URL))

    # Error handling for possible sql path location errors. os.path.exists() returns a boolean
    # rather than raising IOError, so an explicit check is needed here.
    if not (os.path.exists(args.path_to_sql) or os.path.isdir(args.path_to_sql)):
        logger.error('SQL path does not exist: {path}'.format(path=args.path_to_sql))
        sys.exit(1)
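# A hedged example of the DB_URL produced above, with made-up argument values purely for
# illustration (the real values come from the positional command-line arguments):
DB_URL_EXAMPLE = 'mysql://{user}:{passwd}@{host}/{db}'.format(
    user='reporter', passwd='s3cret', host='db.example.org', db='newsdb')
# -> 'mysql://reporter:s3cret@db.example.org/newsdb'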
def __init__(self,
             fields=OrderedDict({
                 "_id": "document_id",
                 "title": "title",
                 "feedName": "feedName",
                 "category": "category",
                 "feedURL": "feedURL",
                 "published": "published"
             }),
             num_distinct_documents=0,
             document_table_name="documents",
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__), "logs/DocumentGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes the context and sets up the documents that will be parsed. Also establishes the
    PostgresConnector that will later be used to push the retrieved documents.
    :param fields: (OrderedDict) Key-value pairs that map each field to retrieve (key) to the
           column name it should get in the SQL table (value). Ordered because SQL tables are.
    :param num_distinct_documents: (int) As the name indicates, the number of distinct articles that
           should be used. Mainly for debugging purposes. 0 means all documents will be used, in
           accordance with MongoDB standards.
    :param document_table_name: (str) Name of the Postgres table that should contain the documents.
    :param database: (str) Database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the Postgres database.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
    """
    # set up logger
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info("Successfully registered logger to DocumentGenerator.")

    # register a MongoConnector
    self.mc = MongoConnector()
    self.logger.info("Successfully registered MongoConnector to DocumentGenerator.")

    self.num_distinct_documents = num_distinct_documents
    # get the distinct IDs for the documents so we can match against them later
    if self.num_distinct_documents != 0:
        self.logger.info("Non-zero limit detected. Fetching first N distinct document IDs now...")
        with self.mc as open_mc:
            documents = open_mc.client[open_mc.news].articles
            self.first_documents = list(documents.find().limit(self.num_distinct_documents))
            # for a small enough number and a large enough document collection, this is more efficient:
            self.first_documents = [el["_id"] for el in self.first_documents]
            self.logger.info("Successfully registered relevant document IDs.")
    else:
        # needed to avoid later conflicts
        self.first_documents = []

    # set up PostgresConnector. Since we only use these once, I don't see any reason to store the
    # connection details locally again.
    self.pc = PostgresConnector(database, user, password, host, port)
    self.logger.info("Successfully registered PostgresConnector to DocumentGenerator.")

    # format them into a reasonable format
    self.fields = fields
    if not self.fields:
        self.logger.error("No fields for MongoDB table specified!")
    self.values_to_retrieve = {key: 1 for key in self.fields.keys()}
    # suppress _id if not wanted, as it is returned by default.
    if "_id" not in self.values_to_retrieve.keys():
        self.values_to_retrieve["_id"] = 0
    # TODO
    self.sql_format = ", ".join([value for value in self.fields.values()])
    self.document_table_name = document_table_name

    # preparation for later. According to PEP8
    self.data = []

    self.logger.info("Successfully set up DocumentGenerator.")
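# A minimal usage sketch, assuming the __init__ above belongs to a class named DocumentGenerator
# (the name used in its log messages) and that both MongoDB and Postgres are reachable with the
# configured credentials.
dg = DocumentGenerator(num_distinct_documents=50, port=5435)
dg.logger.info(dg.sql_format)   # "document_id, title, feedName, category, feedURL, published"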
    ./find_reactions_by_stoichiometry -h

MIT License
Zachary King 2015
"""

from utils import io_args, load_model_any_type, save_model_with_type, set_up_logger
import requests
import logging
from collections import defaultdict

# get command line args and logger
args = io_args()
set_up_logger(**args)

# load the cobra model
model = load_model_any_type(usage=usage, **args)


def has_reaction_id(reaction_id):
    url = (args['host'].rstrip('/') + '/universal/reactions/' + reaction_id)
    res = requests.get(url)
    return res.status_code != 404


# update the model
logging.debug('Running queries')
categories = defaultdict(list)
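# A small sketch of how has_reaction_id above could be used to sort the loaded model's reactions,
# assuming args['host'] points at a BiGG-style API that serves /universal/reactions/<id>.
# The category keys 'found' and 'missing' are made up for illustration.
for reaction in model.reactions:
    if has_reaction_id(reaction.id):
        categories['found'].append(reaction.id)
    else:
        categories['missing'].append(reaction.id)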