import argparse
import os

from logging_config import get_logger

# SaverToJSON and SaverToXML are assumed to be the project's saver classes, each
# exposing a save_to_file method; their import lives elsewhere in the source.


def command_line_arguments():
    parser = argparse.ArgumentParser(
        description="Form a list of rooms, each with the list of students inside it.")
    parser.add_argument("students", type=str, help="path to students json")
    parser.add_argument("rooms", type=str, help="path to rooms json")
    parser.add_argument("output_format", type=str, help="output file format")
    parser.add_argument("-o",
                        "--output_path",
                        type=str,
                        help="output file path",
                        default=os.getcwd())
    parser.add_argument("-n",
                        "--output_name",
                        type=str,
                        help="output file name",
                        default="result")
    args = parser.parse_args()

    log = get_logger(__name__)

    savers = {
        "json": SaverToJSON,
        "xml": SaverToXML,
    }
    # Check if valid save format
    try:
        save_to_file = savers[args.output_format].save_to_file
    except KeyError:
        msg = f"Unknown file format {args.output_format}, use one of these: {', '.join(savers.keys())}."
        log.error(msg)
        raise
    return args, log, save_to_file
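
# A hedged usage sketch (the script name "main.py" is hypothetical): the parser above
# takes two JSON paths plus an output format, with optional output path and name, e.g.
#
#     python main.py students.json rooms.json json -o ./out -n rooms_report
#
# after which args.output_format == "json" selects SaverToJSON.save_to_file.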
Example #2
import pandas as pd
from logging_config import get_logger

pd.set_option('display.max_columns', None)

data_manager_logger = get_logger("ark.data_manager")


class DataManager:
    def __init__(self, responses_filename):
        data_manager_logger.info("starting __init__ for DataManager")
        self.response_df = pd.read_csv(responses_filename)
        self.response_df.drop_duplicates(subset='Bio: Name', keep="last", inplace=True)
        self.response_df.set_index('Bio: Name', inplace=True)
        self.response_df.index.name = None
        timestamps = self.response_df['Timestamp']
        self.response_df.drop(columns=['Timestamp'], inplace=True)

    @staticmethod
    def sort_responses(df):
        data_manager_logger.info("starting sort_responses")
        try:
            data_manager_logger.info("creating empty question lists")
            bio_questions = []
            pref_questions = []
            int_questions = []
            hab_questions = []
            pers_questions = []
            flat_questions = []
            extra_questions = []
Example #3
from const_vars.constant_conll_testb import Dict_Co
from collections import defaultdict
import spacy
import math
import csv
import os
import json
import logging_config
import urllib.parse
import urllib.request
import requests

log = logging_config.get_logger()


def tok_file(toks_dict, m_e_id, ty):
    if not os.path.exists(Dict_Co.res_path):
        os.makedirs(Dict_Co.res_path)
    # newline='' prevents blank rows in the CSV output on Windows (per the csv module docs).
    with open(os.path.join(Dict_Co.res_path,
                           str(m_e_id) + "--" + ty + '.csv'),
              'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['token', 'count'])
        for tok, idxs in toks_dict.items():
            writer.writerow([tok, len(idxs)])


def create_index(tokens):
    index = defaultdict(list)
    for token_index, token in enumerate(tokens):
        index[token].append(token_index)
    return index
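
# A quick illustration of create_index (input values are hypothetical):
#
#     >>> index = create_index(["the", "cat", "sat", "on", "the", "mat"])
#     >>> index["the"]
#     [0, 4]
#
# Passing such an index to tok_file writes one (token, count) row per distinct
# token, where the count is the number of recorded positions.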
Example #4
from datetime import datetime, timedelta

import numpy as np

from logging_config import get_logger

logger = get_logger(__file__)


def presplit_data(item_feature_data,
                  user_item_interaction_data,
                  num_min=3,
                  remove_unk=True,
                  sort=True,
                  test_size_days=14,
                  item_id_type='ITEM IDENTIFIER',
                  ctm_id_type='CUSTOMER IDENTIFIER'):
    """
    Split data into train and test set.

    Parameters
    ----------
    num_min:
        Minimal number of interactions (transactions or clicks) for a customer to be included in the dataset
        (interactions can be both in train and test sets)
    remove_unk:
        Remove items in the interaction set that are not in the item features set, e.g. "items" that are services
        like skate sharpening
    sort:
        Sort the dataset by date before splitting into train/test sets, so that the
        test set chronologically follows the train set
Example #5
"""
This module represents a single point of entry into the log module classes.

Invoking the log aggregator is given below:

    python log_aggregator.py -f <fully qualified file path to log file>
"""

import argparse
import datetime
import logging
import re

import log_models as models
import logging_config

logger = logging_config.get_logger('default')

# Note: the final group is a character class, so it matches runs of 'X', '|', or digits.
LOG_PATTERN = re.compile(r'^(.*)T.*\t([A-Za-z0-9]+)\t([XXX|0-9]+)$')


class InvalidDataSetException(Exception):
    """Raised if data set is found to be invalid."""

    def __init__(self, line):
        """Line that represents an invalid data set."""
        super().__init__('Invalid format for line: {}'.format(line))


def parse_logs_and_generate_report(file_path):
    """Initialises the data sets and generates reports.
Example #6
import os
from functools import partial
from logging_config import get_logger
from sounds import sounds_config
from sounds.audio_sound_pre_processing import prepare_audio_sound
from sounds.ground_truth_processor import GroundtruthReader
from sounds.model_labeler import ModelLabelEncoder
from sounds.model_predictor import ModelPredictor
import pandas as pd
import numpy as np
from sounds.model_structures import *
from sounds.model_trainer import AudioFeaturesModel, train_and_test_model
from pandas import DataFrame
from utils.file_utils import return_from_path, save_object, load_object

logger = get_logger(__name__)


def save_features(groundtruth, path, dataset_name, filter_label=None):
    gtp = GroundtruthReader(f'{sounds_config.sounds_data_dir}/{groundtruth}')
    prepare_audio_sound_groundtruth = partial(prepare_audio_sound, gtp,
                                              filter_label)
    if not os.path.isdir(f'{sounds_config.sounds_data_dir}/{dataset_name}'):
        os.mkdir(f'{sounds_config.sounds_data_dir}/{dataset_name}')
    ftrs = return_from_path(prepare_audio_sound_groundtruth, path,
                            sounds_config.extension)
    audio_sound_df = DataFrame(ftrs)
    save_object(
        audio_sound_df,
        f'{sounds_config.sounds_data_dir}/{dataset_name}/{os.path.basename(path)}.data'
    )
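
# Hedged usage sketch (file and directory names are hypothetical):
#
#     save_features('groundtruth.csv', '/data/raw/train', 'train_set')
#
# return_from_path applies the prepare_audio_sound partial to every matching file
# under the path, and the resulting DataFrame is persisted with save_object.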
Example #7
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from logging_config import get_logger
from utils import set_indexes

pd.set_option('display.max_columns', None)

scoring_logger = get_logger("ark.scoring")


class Scoring:
    def __init__(self, data_dir):
        scoring_logger.info("starting __init__ for Scoring")
        # Load each category table and apply the shared index setup.
        for attr in ('bio', 'pref', 'int', 'habit', 'pers', 'flat', 'extra'):
            setattr(self, f'{attr}_df',
                    set_indexes(pd.read_csv(f'{data_dir}/{attr}_df.csv')))

    @staticmethod
Example #8
import re
from itertools import zip_longest

from logging_config import get_logger


class Version:
    """
    A class used to represent a version string to enable semantic comparison between them.


    Attributes
    ----------
    version : str
        Version string as it was inputted.
    __version_comparable : list[int]
        Version encoded as a list of ints to enable comparison.
    dev_stage_to_int_table : dict
        Lookup table for char to int conversion.

    Methods
    -------
    _is_valid_operand(Version instance)
        Checks for comparison ops whether other obj is a Version instance.
    _get_int(char)
        Return a corresponding int for a char from a lookup table.

    Example
    -------
    >>> a = Version("1.0.0")
    >>> b = Version("1.0.0-beta")
    >>> a > b
    True
    """
    log = get_logger(__name__)
    dev_stage_to_int_table = {
        # dev-stage ordering: alpha < beta < rc < final (final maps to 0)
        "r": -1,  # rc
        "c": -1,  # rc
        "b": -2,  # beta
        "a": -3,  # alpha
    }

    # int_to_dev_stage_table = {v: k for k, v in dev_stage_to_int_table.items()}

    def __init__(self, version):
        self.version = str(version)
        self.__version_comparable = [
            int(s) if s.isdigit() else self._get_int(s[0])
            for s in re.findall(r"\d+|[a-z]+", self.version.lower())
        ]

    # @property   # lazy compute?
    # def version_comparable(self):
    #     version_separated = re.findall("\d+|[a-z]+", self.version_lover)
    #     return [int(s) if s.isdigit() else self._get_int(s[0]) for s in version_separated]

    def __str__(self):
        return self.version

    def __repr__(self):
        return f"Version instance {str(self.__version_comparable)}"

    @staticmethod
    def _is_valid_operand(other):
        return isinstance(other, Version)

    def _get_int(self, char):
        try:
            return self.dev_stage_to_int_table[char]
        except KeyError:
            msg = (f"Unknown letter {char!r} in {self.version}; treating it as beta. "
                   f"Known letters: {list(self.dev_stage_to_int_table)}")
            self.log.warning(msg)
            return self.dev_stage_to_int_table['b']

    def __eq__(self, other):
        if not self._is_valid_operand(other):
            return NotImplemented
        for left, right in zip_longest(self.__version_comparable,
                                       other.__version_comparable,
                                       fillvalue=0):
            if left == right:
                continue
            else:
                return False
        else:
            return True

    def __lt__(self, other):
        if not self._is_valid_operand(other):
            return NotImplemented
        for left, right in zip_longest(self.__version_comparable,
                                       other.__version_comparable,
                                       fillvalue=0):
            if left == right:
                continue
            elif left < right:
                return True
            elif left > right:
                return False
        else:
            return False
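
# A small usage sketch of the semantics above (version strings are hypothetical):
#
#     >>> releases = ["1.0.0", "1.0.0-beta", "1.0.0-rc1", "0.9.9"]
#     >>> sorted(releases, key=Version)
#     ['0.9.9', '1.0.0-beta', '1.0.0-rc1', '1.0.0']
#
# sorted() only needs __lt__, and zip_longest pads the shorter version with zeros,
# so "1.0.0-rc1" sorts before "1.0.0" because rc maps to -1 < 0.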
Example #9
import pickle
import uuid
import warnings
import os.path
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
import pandas as pd
from logging_config import get_logger

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

google_api_manager_logger = get_logger("ark.google_api_manager")

def gsheet_api_check(SCOPES):
    google_api_manager_logger.info("running gsheet_api_check")
    creds = None
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    return creds
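
# Typical follow-up (sketch): the returned credentials feed googleapiclient's build();
# 'sheets' / 'v4' are the Sheets API service name and version.
#
#     creds = gsheet_api_check(['https://www.googleapis.com/auth/spreadsheets.readonly'])
#     service = build('sheets', 'v4', credentials=creds)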
Example #10
from typing import List

from logging_config import get_logger

# ConnectorBase and DbExecutor are assumed to be project classes provided elsewhere
# in the source (a connector base class and a database-executor base class).


class Task4DbHandler(DbExecutor):
    """
    Class to handle all possible interactions with database within the scope of a given task (Task4)..


    __init__
    ----------
    :Bool from_scratch: If True then DROP DATATABLE IF EXISTS, False -> use existing

    """
    log = get_logger(__name__)
    def __init__(self, conncector: ConnectorBase, db_config: dict, from_scratch: bool = False, ):
        """
        :Bool from_scratch: If True then DROP DATATABLE IF EXISTS, False -> use existing
        """
        self.db_name = db_config.pop("database")
        super().__init__(conncector, db_config)
        if from_scratch:
            self.__drop_database()
        self.create_database()

    def __drop_database(self):
        query = f"DROP DATABASE IF EXISTS {self.db_name}"
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
            self.log.warning(f"DROPPED {self.db_name} IF EXISTS")

    def create_database(self):
        query = f"CREATE DATABASE IF NOT EXISTS {self.db_name}"
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
        query = f"USE {self.db_name}"
        with self.cnx.cursor() as cursor:
            cursor.execute(query)

    def create_table_rooms(self):
        query = """CREATE TABLE IF NOT EXISTS rooms (
                    id INT NOT NULL,
                    name VARCHAR(12) NOT NULL
                    )
                """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)

    def create_table_students(self):
        query = """CREATE TABLE IF NOT EXISTS students (
                    birthday DATETIME NOT NULL,
                    id INT NOT NULL,
                    name VARCHAR(100) NOT NULL,
                    room INT NOT NULL,
                    sex ENUM('M', 'F') NOT NULL
                    )
                """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)

    def insert_room(self, dic: dict):
        query = """INSERT INTO rooms
                   (id, name)
                   VALUES (%(id)s, %(name)s)
                """
        with self.cnx.cursor() as cursor:
            cursor.execute(query, dic)

    def insert_rooms(self, list_of_rooms: List[dict]):
        query = """INSERT INTO rooms
                   (id, name)
                   VALUES (%(id)s, %(name)s)
                """
        with self.cnx.cursor() as cursor:
            cursor.executemany(query, list_of_rooms)

    def insert_student(self, dic: dict):
        query = """INSERT INTO students
                   (birthday, id, name, room, sex)
                   VALUES (%(birthday)s, %(id)s, %(name)s, %(room)s, %(sex)s)
                """
        with self.cnx.cursor() as cursor:
            cursor.execute(query, dic)

    def insert_students(self, list_of_students: List[dict]):
        query = """INSERT INTO students
                   (birthday, id, name, room, sex)
                   VALUES (%(birthday)s, %(id)s, %(name)s, %(room)s, %(sex)s)
                """
        with self.cnx.cursor() as cursor:
            cursor.executemany(query, list_of_students)

    def room_population(self):
        """List the rooms and the number of students in each of them."""
        query = """
                SELECT r.id, COUNT(s.id) as population
                FROM rooms AS r
                    LEFT JOIN students AS s
                        ON r.id = s.room
                GROUP BY r.id
                ORDER BY population DESC
                """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def top5_least_average_age(self):
        """Top 5 rooms with the lowest average student age."""
        query = """
                SELECT room,
                       AVG(DATEDIFF(NOW(), birthday)) AS avg_age
                FROM students
                GROUP BY room
                ORDER BY avg_age ASC
                LIMIT 5
                """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def top5_max_diff_age(self):
        """Top 5 rooms with the largest age difference between students."""
        query = """
                WITH min_max AS
                (
                    SELECT room,
                           MAX(birthday) AS max_age,
                           MIN(birthday) AS min_age
                    FROM students
                    GROUP BY room
                )
                SELECT room,
                       MAX(DATEDIFF(max_age, min_age)) AS max_diff
                FROM min_max
                GROUP BY room
                ORDER BY max_diff DESC
                LIMIT 5
                """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def mixed_sex(self):
        """List the rooms that house students of both sexes."""
        query = """
                SELECT room,
                       COUNT(DISTINCT(sex)) AS sex_dst
                FROM students
                GROUP BY room
                HAVING sex_dst > 1
                """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def add_indexes(self):
        """Add PRIMARY KEY and FOREIGN KEY constraints to the tables."""
        query = """
                ALTER TABLE rooms ADD PRIMARY KEY (id)
                """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
        query = """
                ALTER TABLE students ADD PRIMARY KEY (id),
                ADD FOREIGN KEY (room) REFERENCES rooms (id)
                """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
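
# Hedged usage sketch: MySQLConnector and the config values are hypothetical
# stand-ins for the project's real ConnectorBase implementation.
#
#     handler = Task4DbHandler(MySQLConnector(), {"database": "dorm", "user": "root"},
#                              from_scratch=True)
#     handler.create_table_rooms()
#     handler.create_table_students()
#     handler.insert_rooms(rooms)        # rooms / students: lists of dicts from the inputs
#     handler.insert_students(students)
#     handler.add_indexes()
#     print(handler.room_population())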
Example #11
# import json
import os
from typing import Dict, List, Iterable

from category_class import Category

from logging_config import get_logger


_logger = get_logger(logger_name=__name__)


def log_too_many_elements_in_def(parsed_entry: Dict[str, List[str]],
                                 definition_tag: str,
                                 ):
    try:
        definition = parsed_entry[definition_tag]
    except KeyError:
        pass
    else:
        if len(definition) > 1:
            _logger.debug("Too many elements in definition: " + str(definition))


def log_missing_language(parsed_entry: Dict[str, List[str]],
                         first_language_tag: str,
                         second_language_tag: str,
                         ):
    try:
        parsed_entry[first_language_tag]
        parsed_entry[second_language_tag]
Example #12
import smtplib
import ssl
from credentials import email_address, password
from email import encoders
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
import pandas as pd

from logging_config import get_logger
from utils import set_indexes

email_manager_logger = get_logger("ark.email_manager")


class EmailManager:
    def __init__(self, name, csv_filename):
        email_manager_logger.info('starting __init__ for EmailManager')
        self.name = name
        self.port = 465  # For SSL
        self.smtp_server = "smtp.gmail.com"
        self.sender = email_address
        self.password = password
        self.csv_filename = csv_filename
        self.contact_df = set_indexes(
            pd.read_csv('data_tables/contact_df.csv'))
        self.rec_email = self.contact_df.loc[self.name,
                                             'Contact: Email address']

    def create_message(self):
        try:
            email_manager_logger.info("starting create_message")
Example #13
import pandas as pd
import numpy as np
from logging_config import get_logger
from utils import set_indexes

pd.set_option('display.max_columns', None)

user_app_logger = get_logger("ark.user_app")


class UserApp:
    def __init__(self, name, score_dict, data_dir):
        user_app_logger.info("starting __init__ for UserApp")
        self.name = name
        self.user_info = score_dict["user_info"].loc[
            score_dict["user_info"].index == name]
        self.bio_score_cos_matrix = score_dict["bio_score_cos_matrix"][[
            self.name
        ]]
        self.pref_matrix = score_dict["pref_matrix"][
            score_dict["pref_matrix"]["pref_name_x"] == self.name]
        self.interest_score_cos_matrix = score_dict[
            "interest_score_cos_matrix"][[self.name]]
        self.habit_score_cos_matrix = score_dict["habit_score_cos_matrix"][[
            self.name
        ]]
        self.personality_score_cos_matrix = score_dict[
            "personality_score_cos_matrix"][[self.name]]
        self.flat_info = score_dict["flat_info"]
        self.contact_df = set_indexes(
            pd.read_csv('{}/contact_df.csv'.format(data_dir)))