Example #1
    def __init__(self,
                 debug=False,
                 verbose=False,
                 slackbot_injected=None,
                 slacker_injected=None):
        self.debug = debug
        self.verbose = verbose
        self.config = config.Config()
        slackbot_token = os.getenv(self.config.slackbot_api_token_env_varname)
        api_token = os.getenv(self.config.api_token_env_varname)

        self.slackbot = slackbot_injected or slackbot.Slackbot(
            config.SLACK_NAME, token=slackbot_token)

        self.logger = logging.getLogger(__name__)
        utils.set_up_logger(self.logger,
                            log_level_env_var='DESTALINATOR_LOG_LEVEL',
                            log_to_slack_env_var='DESTALINATOR_LOG_TO_CHANNEL',
                            log_channel=self.config.log_channel,
                            slackbot=self.slackbot)

        self.destalinator_activated = False
        if os.getenv(self.config.destalinator_activated_env_varname):
            self.destalinator_activated = True
        self.logger.debug("destalinator_activated is %s",
                          self.destalinator_activated)

        self.slacker = slacker_injected or slacker.Slacker(
            config.SLACK_NAME, token=api_token, logger=self.logger)

        self.ds = destalinator.Destalinator(
            slacker=self.slacker,
            slackbot=self.slackbot,
            activated=self.destalinator_activated,
            logger=self.logger)
Example #2
    def __init__(self, debug=False, verbose=False, slackbot_injected=None, slacker_injected=None):
        self.debug = debug
        self.verbose = verbose
        self.config = config.Config()
        slackbot_token = os.getenv(self.config.slackbot_api_token_env_varname)
        api_token = os.getenv(self.config.api_token_env_varname)

        self.slackbot = slackbot_injected or slackbot.Slackbot(config.SLACK_NAME, token=slackbot_token)

        self.logger = logging.getLogger(__name__)
        utils.set_up_logger(self.logger,
                            log_level_env_var='DESTALINATOR_LOG_LEVEL',
                            log_to_slack_env_var='DESTALINATOR_LOG_TO_CHANNEL',
                            log_channel=self.config.log_channel,
                            slackbot=self.slackbot)

        self.destalinator_activated = False
        if os.getenv(self.config.destalinator_activated_env_varname):
            self.destalinator_activated = True
        self.logger.debug("destalinator_activated is %s", self.destalinator_activated)

        self.slacker = slacker_injected or slacker.Slacker(config.SLACK_NAME, token=api_token, logger=self.logger)

        self.ds = destalinator.Destalinator(slacker=self.slacker,
                                            slackbot=self.slackbot,
                                            activated=self.destalinator_activated,
                                            logger=self.logger)
Example #3
 def __init__(self, logging_flag=''):
     app_name = os.path.splitext(os.path.basename(__file__))[0]  # script name without path or extension
     utils.set_up_logger(app_name, logging_flag)
     # Misc. class attributes
     self.cursor = None
     # Counters
     self.count_discarded_urls = 0
     self.count_crawled_pages = 0
     self.count_downloaded_pages = 0
     self.count_no_links_found_pages = 0
     self.total_links_added = 0
     # Timers
     self.crawl_time = 0
     self.now = ''
Example #4
 def __init__(self,
              prefix="entity",
              window_size=2,
              entities_only=True,
              port=5436,
              log_file=os.path.join(os.path.dirname(__file__), "logs/SchemaCreator.log"),
              log_level=logging.INFO,
              log_verbose=True
              ):
     """
     Set up.
     :param prefix: (str) Prefix to the table names.
     :param window_size: (int) Context window size; it is appended to the prefix to form the table names.
     :param entities_only: (boolean) Whether only entity terms (rather than all term occurrences) are considered.
     :param port: (int) Used to connect to the Postgres tables.
     :param log_file: (os.path) Path to the file containing the logs.
     :param log_level: (logging.LEVEL) Specifies the level to be logged.
     :param log_verbose: (boolean) Specifies whether or not to look to stdout as well.
     """
     self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
     self.window_size = window_size
     self.prefix = prefix + "_" + str(self.window_size)
     self.entities_only = entities_only
     self.names = self.get_names(self.prefix)
     self.port = port
     self.pc = PostgresConnector(port=port)
     self.logger.info("Successfully registered SchemaGenerator.")
Example #5
    def __init__(self,
                 secrets=os.path.join(os.path.dirname(__file__),
                                      "secrets.json"),
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/MongoConnector.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        :param secrets: (os.path) Path to the file containing the relevant parameters for the login via an SSH tunnel
                into the MongoDB. Will be treated as a JSON object (i.e. a dictionary in Python), and must contain
                the following parameters:
                SSH_HOST, specifies the tunnel address
                LDAP_USER, user name for ssh login
                LDAP_PASSWORD, corresponding password
                MONGODB_PORT, usually 27016 (careful, this has to be an integer)
                MONGODB_HOST, host address of the mongodb, usually localhost
                MONGODB_AUTH_DB, database which is owned by user
                MONGODB_AUTH_USER, username for mongodb
                MONGODB_AUTH_PW, corresponding password
                MONGODB_NEWS_DB, database which contains the relevant news articles.
                The default location of the secrets file is assumed to be the same directory as the class definition.
        :param log_file: (os.path) Path to the file containing the logs
        :param log_level: (logging.LEVEL) Specifies the level to be logged
        :param log_verbose: (boolean) Specifies whether or not to look to stdout as well.
        """
        # read secrets
        with open(secrets, "r") as secret_file:
            self.secrets = json.load(secret_file)

        # set up the log file: specify name and level, and include timestamps in the messages.
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to MongoConnector.")
        # name abstraction, since it is frequently used
        self.news = self.secrets['MONGODB_NEWS_DB']
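A secrets file satisfying the docstring above could be generated like this; all values are placeholders, and only the key names come from the docstring.

import json

secrets = {
    "SSH_HOST": "tunnel.example.com",
    "LDAP_USER": "jdoe",
    "LDAP_PASSWORD": "********",
    "MONGODB_PORT": 27016,  # careful: must be an integer
    "MONGODB_HOST": "localhost",
    "MONGODB_AUTH_DB": "jdoe_db",
    "MONGODB_AUTH_USER": "jdoe",
    "MONGODB_AUTH_PW": "********",
    "MONGODB_NEWS_DB": "news",
}

with open("secrets.json", "w") as secret_file:
    json.dump(secrets, secret_file, indent=4)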
Example #6
    def test_set_up_logger(self):
        from utils import set_up_logger
        from logging import Logger

        logger = set_up_logger("test", "test.log")
        self.assertIsInstance(logger, Logger)
        os.remove("test.log")
Example #7
 def __init__(self, logging_flag):
     app_name = os.path.splitext(os.path.basename(__file__))[0]  # script name without path or extension
     utils.set_up_logger(app_name, logging_flag)
     # Misc. class attributes
     self.soup = None
     self.hashed_soup = None
     self.compressed_soup = None
     self.cursor = None
     # Counters
     self.urlerrors = 0
     self.count_prospective_pages = 0
     self.count_saved = 0
     self.count_discarded_pages = 0
     # Timers
     self.start_time = time.time()
     self.request_time = 0
     self.disk_save_time = 0
Example #8
    def __init__(self,
                 num_distinct_documents=5000,
                 replace_entities=True,
                 max_term_length=127,
                 remove_stopwords=True,
                 custom_stopwords=[
                     ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                     'I', '(', ')'
                 ],
                 analyze=False,
                 document_tabe_name="documents",
                 sentence_table_name="sentences",
                 sentence_fields=OrderedDict({
                     "doc_id": "document_id",
                     "sen_id": "sentence_id",
                     "content": "sentence_text"
                 }),
                 term_table_name="terms",
                 term_sql_format=("term_id", "term_text", "is_entity"),
                 term_occurrence_table_name="term_occurrence",
                 term_occurrence_sql_format=("document_id", "sentence_id",
                                             "term_id"),
                 entity_table_name="entities",
                 entity_sql_format=("entity_id", "entity_type"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/TermGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes various parameters, registers logger and MongoConnector, and sets up the limit.
        :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
               For performance reasons, this should be limited during debugging/development.
               0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
        :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
               The reason for this is that single terms might be merged together to one term, i.e. first and last name:
               "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
               whereas - if set to true - "Dennis Aumiller" would represent only one entity.
        :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
        :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
               deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists.
        :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
        :param analyze: (boolean) Whether or not to include analytically relevant metrics.
        :param document_table_name: (str) Name of the table where the document information is stored.
        :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
        :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
               sentence table and its fields.
        :param term_table_name: (str) Name of the Postgres tables for the terms.
        :param term_sql_format: (tuple) Since these are generated locally, a tuple of the Postgres column names suffices.
        :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences.
        :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
        :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
        :param entity_sql_format: (str) Same as term_sql_format, but for entities.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        """
        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to TermGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to TermGenerator.")

        # PostgresConnector
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to TermGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # do this earlier since we need it already for the distinct documents.
        self.document_table_name = document_table_name
        # get the distinct IDs for the documents so we can match against them later
        # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
        self.logger.info("Parsing relevant documents from Postgres...")
        with self.pc as open_pc:
            open_pc.cursor.execute("SELECT document_id FROM {}".format(
                self.document_table_name))
            self.first_distinct_documents = list(open_pc.cursor.fetchall())
            # extract from the tuple structure
            self.first_distinct_documents = [
                el[0] for el in self.first_distinct_documents
            ]
            self.logger.info("Retrieved all relevant documents from Postgres.")

        # additionally restrict if we want only a number of documents.
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Limiting to the first N entries.")
            self.first_distinct_documents = self.first_distinct_documents[:self.num_distinct_documents]

        self.replace_entities = replace_entities
        self.analyze = analyze

        self.max_term_length = max_term_length

        self.nlp = spacy.load("en")

        # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
        # there are any entities in the current sentence with higher efficiency.
        self.occurrence_dict = {}
        self.occurring_entities = []

        # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
        # it is first created as a list and later cast to Counter and set.
        self.terms = []  # cast into a set later on.
        self.term_in_sentence = set()
        self.term_id = {}
        self.term_is_entity = {}
        if self.analyze:
            self.term_count = Counter()
            self.entity_count = Counter()

        self.entities = []
        self.sentences = []
        self.processed_sentences = []

        # Postgres tables
        if not sentence_fields:
            self.logger.error("No sentence fields specified!")
        self.sentence_table_name = sentence_table_name
        self.sentence_fields = sentence_fields
        if not term_sql_format:
            self.logger.error("No term fields specified!")
        self.term_table_name = term_table_name
        self.term_sql_format = ", ".join(term_sql_format)
        if not term_occurrence_sql_format:
            self.logger.error("No term occurrence fields specified!")
        self.term_occurrence_table_name = term_occurrence_table_name
        self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
        if not entity_sql_format:
            self.logger.error("No entity fields specified!")
        self.entity_table_name = entity_table_name
        self.entity_sql_format = ", ".join(entity_sql_format)

        # value retrieving parse:
        self.sentence_values_to_retrieve = {
            key: 1
            for key in self.sentence_fields.keys()
        }
        # suppress _id if not present:
        if "_id" not in self.sentence_values_to_retrieve.keys():
            self.sentence_values_to_retrieve["_id"] = 0
        self.sentence_sql_format = ", ".join(self.sentence_fields.values())

        # create union of stop words, and add potentially custom stop words
        self.remove_stopwords = remove_stopwords
        self.removed_counter = 0
        self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
        # add custom stopwords.
        for word in custom_stopwords:
            self.stopwords.add(word)

        self.logger.info("Successfully initialized TermGenerator.")
Example #9
    def __init__(self,
                 window_size=2,
                 limit_edges=False,
                 entities_only=False,
                 document_table_name="documents",
                 sentence_table_name="sentences",
                 entity_table_name="entities",
                 term_table_name="terms",
                 term_occurrence_table_name="term_occurrence",
                 hyperedge_table_name="hyperedges",
                 hyperedge_format=("edge_id", "term_id", "pos"),
                 hyperedge_document_table_name="hyperedge_document",
                 hyperedge_document_format=("edge_id", "document_id"),
                 hyperedge_sentence_table_name="hyperedge_sentences",
                 hyperedge_sentence_format=("edge_id", "document_id",
                                            "sentence_id", "pos"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/HyperedgeGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes hyper edge generator class.
        :param window_size: (int) Number of sentences in each direction that will determine the context window size
               of the algorithm.
        :param limit_edges: (boolean) Experimental: Should limit the maximum number of terms per hyperedge. This would
               only be useful in context with other theoretical results.
        :param entities_only: (boolean) Indicating whether or not we should only take into account entity terms,
               and not the entirety of all term occurrences for the edges.
        :param document_table_name: (str) Name of the table where documents are stored.
        :param sentence_table_name: (str) Name of the table containing the sentences and their content.
        :param entity_table_name: (str) Name of the table containing the entity information and their properties.
        :param term_table_name: (str) Name of the table containing the terms and meta data.
        :param term_occurrence_table_name: (str) Name of the table containing term occurrence data.
        :param hyperedge_table_name: (str) Name of the table containing the general hyper edge information.
        :param hyperedge_format: (str) Table structure of hyper edge table.
        :param hyperedge_document_table_name: (str) Name of the table containing the document classification.
        :param hyperedge_document_format: (str) Table structure of hyper edge document table.
        :param hyperedge_sentence_table_name: (str) Name of the table containing the hyper edge sentence data.
        :param hyperedge_sentence_format: (str) Table structure of the hyper edge sentence table.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Specifies whether or not to look to stdout as well.
        """

        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info(
            "Successfully registered logger to HyperedgeGenerator.")

        # important for hyperedges
        self.window_size = window_size
        self.limit_edges = limit_edges
        self.entities_only = entities_only

        # table names
        self.document_table_name = document_table_name
        self.sentence_table_name = sentence_table_name
        self.entity_table_name = entity_table_name
        self.term_table_name = term_table_name
        self.term_occurrence_table_name = term_occurrence_table_name
        self.hyperedge_table_name = hyperedge_table_name
        self.hyperedge_document_table_name = hyperedge_document_table_name
        self.hyperedge_sentence_table_name = hyperedge_sentence_table_name

        self.hyperedge_format = ", ".join(hyperedge_format)
        self.hyperedge_document_format = ", ".join(hyperedge_document_format)
        self.hyperedge_sentence_format = ", ".join(hyperedge_sentence_format)

        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to HyperedgeGenerator.")

        self.hyperedge = []
        self.hyperedge_sentence = []
        self.hyperedge_document = []
        self.all_hyperedges = []
        self.all_hyperedge_sentences = []

        # set up the "hyper edge ID counter", which is simply consecutive from 1.
        with self.pc as open_pc:
            if not check_table_existence(self.logger, open_pc,
                                         self.hyperedge_table_name):
                # __init__ cannot return a value, so abort setup with an error.
                raise RuntimeError("Table {} does not exist.".format(
                    self.hyperedge_table_name))

            self.logger.info("Retrieving current hyper edge ID key...")
            open_pc.cursor.execute(
                "SELECT COUNT(DISTINCT h.edge_id) FROM {} as h".format(
                    self.hyperedge_table_name))
            # start at 1, or continue from the current number of distinct edge IDs
            self.hyperedge_ID = max(1, open_pc.cursor.fetchone()[0])
Example #10
parser.add_argument('username', help='db user')
parser.add_argument('host', help='db host')
parser.add_argument('db_name', help='db name')
parser.add_argument('password', help='username db password')
parser.add_argument('--version',
                    '-v',
                    action='version',
                    version='%(prog)s 1.0')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--updateVersion')

args = parser.parse_args()

# we can assume the arguments are correct at this point

logger = utils.set_up_logger()


def main():

    DB_URL = 'mysql://{user}:{passwd}@{host}/{db}'.format(
        user=args.username, passwd=args.password, host=args.host, db=args.db_name)
    logger.debug('DB_URL: {url}'.format(url=DB_URL))

    # Bail out when the sql path does not exist: os.path.exists() returns
    # False rather than raising IOError, so the result must be checked explicitly.
    if not (os.path.exists(args.path_to_sql) or os.path.isdir(args.path_to_sql)):
        logger.error('sql path not found: {path}'.format(path=args.path_to_sql))
        sys.exit(1)
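One caveat with building DB_URL via plain string formatting: a password containing reserved characters such as '@' or '/' yields a malformed URL. A sketch of a safer variant using the standard library (Python 3):

from urllib.parse import quote_plus

# Percent-encode the credentials so reserved characters survive in the URL.
DB_URL = 'mysql://{user}:{passwd}@{host}/{db}'.format(
    user=quote_plus(args.username),
    passwd=quote_plus(args.password),
    host=args.host,
    db=args.db_name)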
Example #11
    def __init__(self,
                 fields=OrderedDict({
                     "_id": "document_id",
                     "title": "title",
                     "feedName": "feedName",
                     "category": "category",
                     "feedURL": "feedURL",
                     "published": "published"
                 }),
                 num_distinct_documents=0,
                 document_table_name="documents",
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/DocumentGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes context, and sets up documents that will be parsed.
        Also establishes the PostgresConnector that will later be used to push the retrieved documents.
        :param fields: (OrderedDict) Key-value pairs that indicate a mapping of fields that should be retrieved (key),
               and the respective field it should be called in the SQL table. Ordered because SQL tables are.
        :param num_distinct_documents: (int) As the name indicates, the number of distinct articles that should be used.
               Mainly for debugging purposes. 0 means all documents will be used, in accordance with MongoDB standards.
        :param document_table_name: (str) Name of the Postgres table that should contain the documents
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Specifies whether or not to look to stdout as well.
        """

        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info(
            "Successfully registered logger to DocumentGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # get the distinct IDs for the documents so we can match against them later
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Fetching first N distinct document IDs now..."
            )
            with self.mc as open_mc:
                documents = open_mc.client[open_mc.news].articles
                self.first_documents = list(documents.find().limit(
                    self.num_distinct_documents))
                # for small enough number, and large enough document collection, this is more efficient:
                self.first_documents = [
                    el["_id"] for el in self.first_documents
                ]
                self.logger.info(
                    "Successfully registered relevant document IDs.")
        else:
            # needed to avoid later conflicts
            self.first_documents = []
        # set up PostgresConnector. Since we only use these once, I don't see any reason to store the connection
        # details locally again.
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        # format them into a reasonable format
        self.fields = fields
        if not self.fields:
            self.logger.error("No fields for MongoDB table specified!")
        self.values_to_retrieve = {key: 1 for key in self.fields.keys()}
        # suppress _id if not wanted, as it is returned by default.
        if "_id" not in self.values_to_retrieve.keys():
            self.values_to_retrieve["_id"] = 0
        # TODO
        self.sql_format = ", ".join(self.fields.values())
        self.document_table_name = document_table_name

        # preparation for later; per PEP 8, attributes are initialized in __init__.
        self.data = []
        self.logger.info("Successfully set up DocumentGenerator.")
Example #12
./find_reactions_by_stoichiometry -h

MIT License
Zachary King 2015

"""

from utils import io_args, load_model_any_type, save_model_with_type, set_up_logger

import requests
import logging
from collections import defaultdict

# get command line args and logger
args = io_args()
set_up_logger(**args)

# load the cobra model
model = load_model_any_type(usage=usage, **args)

def has_reaction_id(reaction_id):
    url = (
        args['host'].rstrip('/') +
        '/universal/reactions/' + reaction_id
    )
    res = requests.get(url)
    return res.status_code != 404

# update the model
logging.debug('Running queries')
categories = defaultdict(list)
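The snippet ends right after creating the categories map. A hypothetical continuation, not part of the source, would bucket each model reaction by whether the BiGG endpoint knows its ID:

# Hypothetical continuation: bucket reactions by lookup result.
for reaction in model.reactions:
    key = 'found' if has_reaction_id(reaction.id) else 'missing'
    categories[key].append(reaction.id)
logging.debug('%d found, %d missing',
              len(categories['found']), len(categories['missing']))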