import argparse
import os

from logging_config import get_logger
# SaverToJSON and SaverToXML are imported from the project's saver module
# (the import is elided in this excerpt).


def command_line_arguments():
    parser = argparse.ArgumentParser(
        description="Form a list of rooms containing a list of students inside every room.")
    parser.add_argument("students", type=str, help="path to students json")
    parser.add_argument("rooms", type=str, help="path to rooms json")
    parser.add_argument("output_format", type=str, help="output file format")
    parser.add_argument("-o", "--output_path", type=str, help="output file path",
                        default=str(os.getcwd()))
    parser.add_argument("-n", "--output_name", type=str, help="output file name",
                        default="result")
    args = parser.parse_args()

    log = get_logger(__name__)

    savers = {
        "json": SaverToJSON,
        "xml": SaverToXML,
    }
    # Check that the requested save format is supported
    try:
        save_to_file = savers[args.output_format].save_to_file
    except KeyError:
        msg = (f"Unknown file format {args.output_format}, "
               f"use one of these: {', '.join(savers.keys())}.")
        log.error(msg)
        raise
    return args, log, save_to_file
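# Hypothetical usage sketch (assumption, not in the original module): running
# the script and logging the resolved output target; the actual room/student
# merge step is elided in this excerpt.
if __name__ == "__main__":
    args, log, save_to_file = command_line_arguments()
    log.info("Writing %s.%s to %s",
             args.output_name, args.output_format, args.output_path)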
import pandas as pd

from logging_config import get_logger

pd.set_option('display.max_columns', None)
data_manager_logger = get_logger("ark.data_manager")


class DataManager:
    def __init__(self, responses_filename):
        data_manager_logger.info("starting __init__ for DataManager")
        self.response_df = pd.read_csv(responses_filename)
        # Keep only the most recent response per person, then index by name.
        self.response_df.drop_duplicates(subset='Bio: Name', keep="last", inplace=True)
        self.response_df.set_index('Bio: Name', inplace=True)
        self.response_df.index.name = None
        timestamps = self.response_df['Timestamp']
        self.response_df.drop(columns=['Timestamp'], inplace=True)

    @staticmethod
    def sort_responses(df):
        data_manager_logger.info("starting sort_responses")
        try:
            data_manager_logger.info("creating empty question lists")
            bio_questions = []
            pref_questions = []
            int_questions = []
            hab_questions = []
            pers_questions = []
            flat_questions = []
            extra_questions = []
import csv
import json
import math
import os
import urllib.parse
import urllib.request
from collections import defaultdict

import requests
import spacy

import logging_config
from const_vars.constant_conll_testb import Dict_Co

log = logging_config.get_logger()


def tok_file(toks_dict, m_e_id, ty):
    if not os.path.exists(Dict_Co.res_path):
        os.makedirs(Dict_Co.res_path)
    with open(os.path.join(Dict_Co.res_path, str(m_e_id) + "--" + ty + '.csv'), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['token', 'count'])
        for tok, idxs in toks_dict.items():
            writer.writerow([tok, len(idxs)])


def create_index(tokens):
    index = defaultdict(list)
    for token_index, token in enumerate(tokens):
        index[token].append(token_index)
    return index
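# Hypothetical usage sketch (assumption, not part of the original module):
# build an inverted index over a token list, then dump per-token counts to
# CSV; the id 42 and type "demo" are placeholders.
if __name__ == "__main__":
    tokens = ["the", "cat", "sat", "on", "the", "mat"]
    index = create_index(tokens)           # {'the': [0, 4], 'cat': [1], ...}
    tok_file(index, m_e_id=42, ty="demo")  # writes 42--demo.csv under Dict_Co.res_path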
from datetime import datetime, timedelta

import numpy as np

from logging_config import get_logger

logger = get_logger(__file__)


def presplit_data(item_feature_data,
                  user_item_interaction_data,
                  num_min=3,
                  remove_unk=True,
                  sort=True,
                  test_size_days=14,
                  item_id_type='ITEM IDENTIFIER',
                  ctm_id_type='CUSTOMER IDENTIFIER'):
    """
    Split data into train and test sets.

    Parameters
    ----------
    num_min:
        Minimal number of interactions (transactions or clicks) for a customer
        to be included in the dataset (interactions can be in both the train
        and test sets).
    remove_unk:
        Remove items in the interaction set that are not in the item feature
        set, e.g. "items" that are services like skate sharpening.
    sort:
        Sort the dataset by date before splitting into train/test sets, so
        that the test set succeeds the train set in time.
This module represents a single point of entry into the log module classes.

The log aggregator is invoked as follows:

    python log_aggregator.py -f <fully qualified file path to log file>
"""
import argparse
import datetime
import logging
import re

import log_models as models
import logging_config

logger = logging_config.get_logger('default')

LOG_PATTERN = re.compile(r'^(.*)T.*\t([A-Za-z0-9]+)\t([XXX|0-9]+)$')


class InvalidDataSetException(Exception):
    """Raised if a data set is found to be invalid."""

    def __init__(self, line):
        """Line that represents an invalid data set."""
        super().__init__('Invalid format for line: {}'.format(line))


def parse_logs_and_generate_report(file_path):
    """Initialises the data sets and generates reports.
import os
from functools import partial

import numpy as np
import pandas as pd
from pandas import DataFrame

from logging_config import get_logger
from sounds import sounds_config
from sounds.audio_sound_pre_processing import prepare_audio_sound
from sounds.ground_truth_processor import GroundtruthReader
from sounds.model_labeler import ModelLabelEncoder
from sounds.model_predictor import ModelPredictor
from sounds.model_structures import *
from sounds.model_trainer import AudioFeaturesModel, train_and_test_model
from utils.file_utils import return_from_path, save_object, load_object

logger = get_logger(__name__)


def save_features(groundtruth, path, dataset_name, filter_label=None):
    gtp = GroundtruthReader(f'{sounds_config.sounds_data_dir}/{groundtruth}')
    prepare_audio_sound_groundtruth = partial(prepare_audio_sound, gtp, filter_label)
    if not os.path.isdir(f'{sounds_config.sounds_data_dir}/{dataset_name}'):
        os.mkdir(f'{sounds_config.sounds_data_dir}/{dataset_name}')
    ftrs = return_from_path(prepare_audio_sound_groundtruth, path, sounds_config.extension)
    audio_sound_df = DataFrame(ftrs)
    save_object(
        audio_sound_df,
        f'{sounds_config.sounds_data_dir}/{dataset_name}/{os.path.basename(path)}.data'
    )
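# Hypothetical usage sketch (assumption: the file names and directory layout
# below are illustrative, not taken from the original project).
if __name__ == "__main__":
    save_features(
        groundtruth="groundtruth.csv",  # relative to sounds_config.sounds_data_dir
        path="/data/audio/train",       # directory scanned by return_from_path
        dataset_name="train_set",
    )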
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

from logging_config import get_logger
from utils import set_indexes

pd.set_option('display.max_columns', None)
scoring_logger = get_logger("ark.scoring")


class Scoring:
    def __init__(self, data_dir):
        scoring_logger.info("starting __init__ for Scoring")
        self.bio_df = set_indexes(pd.read_csv('{}/bio_df.csv'.format(data_dir)))
        self.pref_df = set_indexes(pd.read_csv('{}/pref_df.csv'.format(data_dir)))
        self.int_df = set_indexes(pd.read_csv('{}/int_df.csv'.format(data_dir)))
        self.habit_df = set_indexes(pd.read_csv('{}/habit_df.csv'.format(data_dir)))
        self.pers_df = set_indexes(pd.read_csv('{}/pers_df.csv'.format(data_dir)))
        self.flat_df = set_indexes(pd.read_csv('{}/flat_df.csv'.format(data_dir)))
        self.extra_df = set_indexes(pd.read_csv('{}/extra_df.csv'.format(data_dir)))

    @staticmethod
import re
from itertools import zip_longest

from logging_config import get_logger


class Version:
    """
    A class used to represent a version string and enable semantic
    comparison between versions.

    Attributes
    ----------
    version : str
        Version string as it was inputted.
    __version_comparable : list[int]
        Version encoded as a list of ints to enable comparison.
    dev_stage_to_int_table : dict
        Lookup table for char-to-int conversion.

    Methods
    -------
    _is_valid_operand(other)
        Checks, for comparison ops, whether the other object is a Version instance.
    _get_int(char)
        Returns the corresponding int for a char from a lookup table.

    Example
    -------
    >>> a = Version("1.0.0")
    >>> b = Version("1.0.0-beta")
    >>> a > b
    True
    """

    log = get_logger(__name__)
    dev_stage_to_int_table = {  # alpha < beta < rc < final
        "r": -1,  # rc
        "c": -1,  # rc
        "b": -2,  # beta
        "a": -3,  # alpha
    }
    # int_to_dev_stage_table = {v: k for k, v in dev_stage_to_int_table.items()}

    def __init__(self, version):
        self.version = str(version)
        self.__version_comparable = [
            int(s) if s.isdigit() else self._get_int(s[0])
            for s in re.findall(r"\d+|[a-z]+", self.version.lower())
        ]

    # @property  # lazily compute instead?
    # def version_comparable(self):
    #     version_separated = re.findall(r"\d+|[a-z]+", self.version.lower())
    #     return [int(s) if s.isdigit() else self._get_int(s[0]) for s in version_separated]

    def __str__(self):
        return self.version

    def __repr__(self):
        return f"Version instance {self.__version_comparable}"

    @staticmethod
    def _is_valid_operand(other):
        return isinstance(other, Version)

    def _get_int(self, char):
        try:
            return self.dev_stage_to_int_table[char]
        except KeyError:
            msg = (f"Unknown letter {char} in {self.version} REPLACED WITH beta. "
                   f"Known are {list(self.dev_stage_to_int_table.keys())}")
            self.log.warning(msg)
            return self.dev_stage_to_int_table['b']

    def __eq__(self, other):
        if not self._is_valid_operand(other):
            return NotImplemented
        for left, right in zip_longest(self.__version_comparable,
                                       other.__version_comparable,
                                       fillvalue=0):
            if left != right:
                return False
        return True

    def __lt__(self, other):
        if not self._is_valid_operand(other):
            return NotImplemented
        for left, right in zip_longest(self.__version_comparable,
                                       other.__version_comparable,
                                       fillvalue=0):
            if left < right:
                return True
            if left > right:
                return False
        return False
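# Hypothetical usage sketch (assumption, not in the original module): sorting
# relies on __lt__; the fillvalue=0 padding makes pre-release stages (negative
# ints) order below the corresponding final release.
versions = [Version("1.0.0"), Version("1.0.0-rc1"), Version("0.9.9"), Version("1.0.0-alpha")]
print([str(v) for v in sorted(versions)])
# -> ['0.9.9', '1.0.0-alpha', '1.0.0-rc1', '1.0.0']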
import os.path
import pickle
import uuid
import warnings

import pandas as pd
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

from logging_config import get_logger

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

google_api_manager_logger = get_logger("ark.google_api_manager")


def gsheet_api_check(SCOPES):
    google_api_manager_logger.info("running gsheet_api_check")
    creds = None
    # Reuse cached credentials if present.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Cache the (possibly refreshed) credentials for next time.
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    return creds
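# Hypothetical usage sketch (assumption, not in the original module): the
# scope URL is the standard read-only Google Sheets scope.
if __name__ == "__main__":
    SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']
    creds = gsheet_api_check(SCOPES)
    service = build('sheets', 'v4', credentials=creds)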
from typing import List

from logging_config import get_logger
# DbExecutor and ConnectorBase are imported from the project's DB layer
# (the import is elided in this excerpt).


class Task4DbHandler(DbExecutor):
    """
    Class to handle all possible interactions with the database within the
    scope of a given task (Task4).

    __init__
    --------
    :bool from_scratch: if True, DROP DATABASE IF EXISTS; if False, use the
        existing database.
    """

    log = get_logger(__name__)

    def __init__(self,
                 connector: ConnectorBase,
                 db_config: dict,
                 from_scratch: bool = False,
                 ):
        """
        :bool from_scratch: if True, DROP DATABASE IF EXISTS; if False, use
            the existing database.
        """
        self.db_name = db_config.pop("database")
        super().__init__(connector, db_config)
        if from_scratch:
            self.__drop_database()
        self.create_database()

    def __drop_database(self):
        query = f"DROP DATABASE IF EXISTS {self.db_name}"
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
        self.log.warning(f"DROPPED {self.db_name} IF EXISTS")

    def create_database(self):
        query = f"CREATE DATABASE IF NOT EXISTS {self.db_name}"
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
        query = f"USE {self.db_name}"
        with self.cnx.cursor() as cursor:
            cursor.execute(query)

    def create_table_rooms(self):
        query = """CREATE TABLE IF NOT EXISTS rooms (
            id INT NOT NULL,
            name VARCHAR(12) NOT NULL
        )
        """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)

    def create_table_students(self):
        query = """CREATE TABLE IF NOT EXISTS students (
            birthday DATETIME NOT NULL,
            id INT NOT NULL,
            name VARCHAR(100) NOT NULL,
            room INT NOT NULL,
            sex ENUM('M', 'F') NOT NULL
        )
        """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)

    def insert_room(self, dic: dict):
        query = """INSERT INTO rooms (id, name)
        VALUES (%(id)s, %(name)s)
        """
        with self.cnx.cursor() as cursor:
            cursor.execute(query, dic)

    def insert_rooms(self, list_of_rooms: List[dict]):
        query = """INSERT INTO rooms (id, name)
        VALUES (%(id)s, %(name)s)
        """
        with self.cnx.cursor() as cursor:
            cursor.executemany(query, list_of_rooms)

    def insert_student(self, dic: dict):
        query = """INSERT INTO students (birthday, id, name, room, sex)
        VALUES (%(birthday)s, %(id)s, %(name)s, %(room)s, %(sex)s)
        """
        with self.cnx.cursor() as cursor:
            cursor.execute(query, dic)

    def insert_students(self, list_of_students: List[dict]):
        query = """INSERT INTO students (birthday, id, name, room, sex)
        VALUES (%(birthday)s, %(id)s, %(name)s, %(room)s, %(sex)s)
        """
        with self.cnx.cursor() as cursor:
            cursor.executemany(query, list_of_students)

    def room_population(self):
        """List of rooms and the number of students in each of them."""
        query = """
        SELECT r.id, COUNT(s.id) AS population
        FROM rooms AS r
        LEFT JOIN students AS s ON r.id = s.room
        GROUP BY r.id
        ORDER BY population DESC
        """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def top5_least_average_age(self):
        """Top 5 rooms with the lowest average student age."""
        query = """
        SELECT room, AVG(DATEDIFF(NOW(), birthday)) AS avg_age
        FROM students
        GROUP BY room
        ORDER BY avg_age ASC
        LIMIT 5
        """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def top5_max_diff_age(self):
        """Top 5 rooms with the largest age difference between students."""
        query = """
        WITH min_max AS (
            SELECT room, MAX(birthday) AS max_age, MIN(birthday) AS min_age
            FROM students
            GROUP BY room
        )
        SELECT room, MAX(DATEDIFF(max_age, min_age)) AS max_diff
        FROM min_max
        GROUP BY room
        ORDER BY max_diff DESC
        LIMIT 5
        """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def mixed_sex(self):
        """List of rooms housing students of both sexes."""
        query = """
        SELECT room, COUNT(DISTINCT sex) AS sex_dst
        FROM students
        GROUP BY room
        HAVING sex_dst > 1
        """
        with self.cnx.cursor(dictionary=True) as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def add_indexes(self):
        """Add PRIMARY KEY and FOREIGN KEY constraints to the tables."""
        query = """
        ALTER TABLE rooms
        ADD PRIMARY KEY (id)
        """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
        query = """
        ALTER TABLE students
        ADD PRIMARY KEY (id),
        ADD FOREIGN KEY (room) REFERENCES rooms (id)
        """
        with self.cnx.cursor() as cursor:
            cursor.execute(query)
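# Hypothetical usage sketch (assumption, not in the original module): the
# connector instance and config values are placeholders; ConnectorBase's
# construction is not shown in this excerpt.
if __name__ == "__main__":
    db = Task4DbHandler(connector, {"database": "task4", "user": "root"},
                        from_scratch=True)
    db.create_table_rooms()
    db.create_table_students()
    db.insert_rooms([{"id": 1, "name": "Room #1"}])
    db.insert_students([{"birthday": "2000-01-01", "id": 1, "name": "A. Student",
                         "room": 1, "sex": "M"}])
    db.add_indexes()
    print(db.room_population())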
# import json
import os
from typing import Dict, List, Iterable

from category_class import Category
from logging_config import get_logger

_logger = get_logger(logger_name=__name__)


def log_too_many_elements_in_def(parsed_entry: Dict[str, List[str]],
                                 definition_tag: str,
                                 ):
    try:
        defini = parsed_entry[definition_tag]
    except KeyError:
        pass
    else:
        if len(defini) > 1:
            _logger.debug("Too many elements in definition: " + str(defini))


def log_missing_language(parsed_entry: Dict[str, List[str]],
                         first_language_tag: str,
                         second_language_tag: str,
                         ):
    try:
        parsed_entry[first_language_tag]
        parsed_entry[second_language_tag]
    except KeyError as missing:  # assumed completion: the original body is truncated here
        _logger.debug("Missing language in entry: " + str(missing))
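# Hypothetical usage sketch (assumption: the tag names and entry contents are
# illustrative, not taken from the original project).
if __name__ == "__main__":
    entry = {"definition": ["a dwelling", "a house"], "en": ["house"]}
    log_too_many_elements_in_def(entry, definition_tag="definition")  # logs: two definitions
    log_missing_language(entry, first_language_tag="en",
                         second_language_tag="de")  # logs: 'de' is missing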
import smtplib
import ssl
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import pandas as pd

from credentials import email_address, password
from logging_config import get_logger
from utils import set_indexes

email_manager_logger = get_logger("ark.email_manager")


class EmailManager:
    def __init__(self, name, csv_filename):
        email_manager_logger.info('starting __init__ for EmailManager')
        self.name = name
        self.port = 465  # for SSL
        self.smtp_server = "smtp.gmail.com"
        self.sender = email_address
        self.password = password
        self.csv_filename = csv_filename
        self.contact_df = set_indexes(
            pd.read_csv('data_tables/contact_df.csv'))
        self.rec_email = self.contact_df.loc[self.name, 'Contact: Email address']

    def create_message(self):
        try:
            email_manager_logger.info("starting create_message")
import pandas as pd
import numpy as np

from logging_config import get_logger
from utils import set_indexes

pd.set_option('display.max_columns', None)
user_app_logger = get_logger("ark.user_app")


class UserApp:
    def __init__(self, name, score_dict, data_dir):
        user_app_logger.info("starting __init__ for UserApp")
        self.name = name
        self.user_info = score_dict["user_info"].loc[
            score_dict["user_info"].index == name]
        self.bio_score_cos_matrix = score_dict["bio_score_cos_matrix"][[self.name]]
        self.pref_matrix = score_dict["pref_matrix"][
            score_dict["pref_matrix"]["pref_name_x"] == self.name]
        self.interest_score_cos_matrix = score_dict["interest_score_cos_matrix"][[self.name]]
        self.habit_score_cos_matrix = score_dict["habit_score_cos_matrix"][[self.name]]
        self.personality_score_cos_matrix = score_dict["personality_score_cos_matrix"][[self.name]]
        self.flat_info = score_dict["flat_info"]
        self.contact_df = set_indexes(
            pd.read_csv('{}/contact_df.csv'.format(data_dir)))
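# Hypothetical usage sketch (assumption: score_dict is built elsewhere with
# the keys read above; the name and directory are placeholders).
app = UserApp("Jane Doe", score_dict, data_dir="data_tables")
print(app.user_info)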