def __init__(self):
    self.log = common.Logger()
    self.settings = common.Settings()
    self.comment_generator = common.CommentGenerator()  # comment generator
    self.browser = Browser('chrome')
    self.subscriptions = []  # our subscriptions
    self.comments = common.Comments2()  # comment database
    common.Subscription.url_list = []  # list of our subscriptions
    self.sleep_time_after_visit = 5
    # our channel
    self.our_channel_url = u'https://www.youtube.com/channel/{}'.format(
        self.settings.get_parameter('address'))
    # subscribe only if the channel has fewer subscribers than this number
    self.max_subscribers_amount = 1000
    # file with the channels from which we take the list of channels to subscribe to
    with open('channels.txt', 'r') as f:
        buffer = f.read()
    self.channels_list = buffer.split()
    self.channels_list = list(filter(bool, self.channels_list))
    self.channels_list = list(filter(lambda x: not x.startswith('#'), self.channels_list))
    # self.channels_list = [x for x in self.channels_list if not x[0] == '#']
    self.all_channel_mode = True
    self.re_is_cyrillic = regex.compile(r'[\p{IsCyrillic}]', regex.UNICODE)
    self.comment_not_russian = 'not russian title!'
    self.comment_errors_counter = 0
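# Hypothetical channels.txt content illustrating what the parsing above expects:
# the file is split on whitespace, empty tokens are dropped, and tokens beginning
# with '#' are skipped. The URLs below are placeholders, not channels from the source.
sample_channels_txt = """
#seed-list
https://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx
https://www.youtube.com/channel/UCyyyyyyyyyyyyyyyyyyyyyy
"""
tokens = sample_channels_txt.split()
parsed_channels = [t for t in tokens if t and not t.startswith('#')]
# parsed_channels now holds the two placeholder URLs; the '#seed-list' token was filtered out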
class MovieClassifier:
    # Logger
    logger = common.Logger()
    clf = None

    # Class Movie Classifier
    def __init__(self):
        self.logger.log("MovieClassifier ------------------------------------ Init")
        self.clf = pickle.load(open('movieclassifier/pkl_objects/classifier.pkl', 'rb'))

    def classify(self, _id, review_text):
        self.logger.log("_id: " + _id)
        # `vect` is a module-level text vectorizer defined outside this snippet
        X = vect.transform([review_text])
        y = self.clf.predict(X)[0]
        proba = np.max(self.clf.predict_proba(X))
        self.logger.log("y: " + str(y))
        self.logger.log("proba: " + str(proba))
        return {'y': int(y), 'proba': proba}

    def train(self, _id, review_text, y):
        self.logger.log("_id: " + _id)
        X = vect.transform([review_text])
        classes = np.array([0, 1])
        self.clf.partial_fit(X, [y], classes=classes)
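# The MovieClassifier above references a module-level `vect` that is not shown in this
# snippet. A minimal sketch of a compatible vectorizer, assuming a HashingVectorizer is
# acceptable: it is stateless, so it pairs with both transform() in classify() and the
# online partial_fit() in train(). This is an assumption, not the source's actual object.
from sklearn.feature_extraction.text import HashingVectorizer

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2 ** 21,
                         preprocessor=None,
                         ngram_range=(1, 1))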
def test_logger(self):
    file_name = str(uuid.uuid4())
    logger = common.Logger(log_file_name=file_name, log_mode='a')
    logger.log_message('TRACKS', 'Hello')
    logger.log_message('Client', 'Hello Tracks')
    log_path = os.path.join(common.project_root(), 'logs', file_name + '.log')
    with open(log_path) as f:
        lines = f.readlines()
    self.assertEqual(True, 'TRACKS' in lines[0])
    self.assertEqual(True, 'Client' in lines[1])
    self.assertEqual(True, 'TRACKS' not in lines[1])
    self.assertEqual(True, 'Tracks' in lines[1])
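# A minimal Logger sketch consistent with the behaviour the test above exercises:
# keyword arguments log_file_name and log_mode, a log_message(tag, message) method,
# and one line per call written to <project_root>/logs/<name>.log. This is an
# assumption about the interface, not the project's actual common.Logger.
import os

class MinimalLogger:
    def __init__(self, log_file_name='app', log_mode='a'):
        logs_dir = os.path.join(common.project_root(), 'logs')
        os.makedirs(logs_dir, exist_ok=True)
        self.path = os.path.join(logs_dir, log_file_name + '.log')
        self.mode = log_mode

    def log_message(self, tag, message):
        # append "<tag>: <message>" as a single line
        with open(self.path, self.mode) as f:
            f.write('{}: {}\n'.format(tag, message))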
def train(env, config, outputs=None):

    logdir = pathlib.Path(config.logdir).expanduser()
    logdir.mkdir(parents=True, exist_ok=True)
    config.save(logdir / 'config.yaml')
    print(config, '\n')
    print('Logdir', logdir)

    outputs = outputs or [
        common.TerminalOutput(),
        common.JSONLOutput(config.logdir),
        common.TensorBoardOutput(config.logdir),
    ]
    replay = common.Replay(logdir / 'train_episodes', **config.replay)
    step = common.Counter(replay.stats['total_steps'])
    logger = common.Logger(step, outputs, multiplier=config.action_repeat)
    metrics = collections.defaultdict(list)

    should_train = common.Every(config.train_every)
    should_log = common.Every(config.log_every)
    should_video = common.Every(config.log_every)
    should_expl = common.Until(config.expl_until)

    def per_episode(ep):
        length = len(ep['reward']) - 1
        score = float(ep['reward'].astype(np.float64).sum())
        print(f'Episode has {length} steps and return {score:.1f}.')
        logger.scalar('return', score)
        logger.scalar('length', length)
        for key, value in ep.items():
            if re.match(config.log_keys_sum, key):
                logger.scalar(f'sum_{key}', ep[key].sum())
            if re.match(config.log_keys_mean, key):
                logger.scalar(f'mean_{key}', ep[key].mean())
            if re.match(config.log_keys_max, key):
                logger.scalar(f'max_{key}', ep[key].max(0).mean())
        if should_video(step):
            for key in config.log_keys_video:
                logger.video(f'policy_{key}', ep[key])
        logger.add(replay.stats)
        logger.write()

    env = common.GymWrapper(env)
    env = common.ResizeImage(env)
    if hasattr(env.act_space['action'], 'n'):
        env = common.OneHotAction(env)
    else:
        env = common.NormalizeAction(env)
    env = common.TimeLimit(env, config.time_limit)

    driver = common.Driver([env])
    driver.on_episode(per_episode)
    driver.on_step(lambda tran, worker: step.increment())
    driver.on_step(replay.add_step)
    driver.on_reset(replay.add_step)

    prefill = max(0, config.prefill - replay.stats['total_steps'])
    if prefill:
        print(f'Prefill dataset ({prefill} steps).')
        random_agent = common.RandomAgent(env.act_space)
        driver(random_agent, steps=prefill, episodes=1)
        driver.reset()

    print('Create agent.')
    agnt = agent.Agent(config, env.obs_space, env.act_space, step)
    dataset = iter(replay.dataset(**config.dataset))
    train_agent = common.CarryOverState(agnt.train)
    train_agent(next(dataset))
    if (logdir / 'variables.pkl').exists():
        agnt.load(logdir / 'variables.pkl')
    else:
        print('Pretrain agent.')
        for _ in range(config.pretrain):
            train_agent(next(dataset))
    policy = lambda *args: agnt.policy(
        *args, mode='explore' if should_expl(step) else 'train')

    def train_step(tran, worker):
        if should_train(step):
            for _ in range(config.train_steps):
                mets = train_agent(next(dataset))
                [metrics[key].append(value) for key, value in mets.items()]
        if should_log(step):
            for name, values in metrics.items():
                logger.scalar(name, np.array(values, np.float64).mean())
                metrics[name].clear()
            logger.add(agnt.report(next(dataset)))
            logger.write(fps=True)
    driver.on_step(train_step)

    while step < config.steps:
        logger.write()
        driver(policy, steps=config.eval_every)
        agnt.save(logdir / 'variables.pkl')
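# Simplified stand-ins illustrating the rate-limiting pattern behind should_train,
# should_log, and should_expl above. These are illustrative sketches only, not the
# project's actual common.Every / common.Until implementations.
class EverySketch:
    def __init__(self, every):
        self._every = every
        self._last = None

    def __call__(self, step):
        # return True at most once per `every` steps
        step = int(step)
        if not self._every:
            return False
        if self._last is None or step >= self._last + self._every:
            self._last = step
            return True
        return False


class UntilSketch:
    def __init__(self, until):
        self._until = until

    def __call__(self, step):
        # return True while the step count is below the cutoff
        return int(step) < self._until if self._until else True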
from typing import List

import common
import dc
import mc
import db

# initialization of Database
database = db.Database("data.db")

# initialization of Tables
database.init_table(
    "admins",
    {
        "user_id": str,
    }
)

# initialize common objects
config = common.Config()
logger = common.Logger()
bot = dc.Bot(config, logger)
servers: List[mc.Server] = []
class StudyBuddyAI:
    # Logger
    logger = common.Logger()

    bidaf_model = None
    predictor = None

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = None
    context_list = None
    all_tfidf_vectorizer = TfidfVectorizer()
    all_tfidf_matrix = None
    all_context_list = None

    # Trained Models
    trained_models = [{
        'name': 'Base Model (9/15/2017)',
        'path': '../../allennlp/train_out/bidaf-model-2017.09.15-charpad.tar.gz'
    }, {
        'name': 'ReTrained Model 1 (12/9/2017)',
        'path': '../../allennlp/train_out/model01.tar.gz'
    }, {
        'name': 'ReTrained Model 2 (12/10/2017)',
        'path': '../../allennlp/train_out/model02.tar.gz'
    }, {
        'name': 'ReTrained Model 3 (12/11/2017)',
        'path': '../../allennlp/train_out/model03.tar.gz'
    }, {
        'name': 'ReTrained Model 4 (12/12/2017)',
        'path': '../../allennlp/train_out/model04.tar.gz'
    }, {
        'name': 'ReTrained Model 5 (12/13/2017)',
        'path': '../../allennlp/train_out/model05.tar.gz'
    }]

    # Context Memory Settings
    context_memory_time = 1  # in minutes
    context_memory_size = 5
    context_memory = []
    context_qa = []

    # Class StudyBuddyAI
    def __init__(self):
        self.logger.log("StudyBuddyAI ------------------------------------ Init")
        # Load pretrained model
        self.load_trained_model('../../allennlp/train_out/model05.tar.gz')

    def get_trained_model_list(self):
        return self.trained_models

    def load_trained_model(self, path):
        self.logger.log("Loading model: " + path)
        self.bidaf_model = DemoModel(path, 'machine-comprehension')
        # predictor
        self.predictor = self.bidaf_model.predictor()

    def save_in_context_memory(self, context):
        # Save the context
        self.context_memory.insert(0, context)
        if len(self.context_memory) > self.context_memory_size:
            # ensure our context list is limited
            self.context_memory = self.context_memory[:self.context_memory_size]

    def save_qa_in_context_memory(self, qa):
        # Save the question/answer pair
        self.context_qa.insert(0, qa)

    def clear_context_memory(self):
        self.context_memory = []
        self.context_qa = []

    def get_context_memory(self):
        return {
            'context_memory': self.context_memory,
            'context_qa': self.context_qa
        }

    def load_tfidf_vectorizer(self, context_list, all=False):
        corpus = list()
        if all == True:
            self.all_context_list = context_list
        else:
            self.context_list = context_list
        for context in context_list:
            # Tokenize
            tokens = self.tokenize_text(context)
            cleaned_context_text = ' '.join(tokens)
            corpus.append(cleaned_context_text)
        # Tf-idf term weighting using TfidfVectorizer
        if all == True:
            self.all_tfidf_matrix = self.all_tfidf_vectorizer.fit_transform(corpus)
        else:
            self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(corpus)

    def predict_from_passage(self, data):
        prediction = self.predictor.predict_json(data)
        self.logger.log(prediction)
        return prediction

    def predict_for_title(self, question, all=False, check_context=False):
        passage = ''
        current_context_list = []
        current_context_start_index = []
        current_context_end_index = []

        # if we need to look at the context only
        if (check_context == True) and (len(self.context_memory) > 0):
            # the top context item
            current_context_list = self.context_memory[:1]
        else:
            # Tokenize
            tokens = self.tokenize_text(question)
            cleaned_context_text = ' '.join(tokens)
            if all == False:
                question_vector = self.tfidf_vectorizer.transform([cleaned_context_text])
            else:
                question_vector = self.all_tfidf_vectorizer.transform([cleaned_context_text])
            # Find cosine similarity of the question with the contexts
            if all == False:
                cs = cosine_similarity(question_vector, self.tfidf_matrix)
            else:
                cs = cosine_similarity(question_vector, self.all_tfidf_matrix)
            self.logger.log(cs)
            cs_list = cs[0]
            idx = 0
            threshold = 0.25
            values_greater_than_zero = [i for i in cs_list if i > 0.0]
            if len(values_greater_than_zero) == 0:
                return {'status': 0}
            # for ctx in self.context_memory:
            #     current_context_start_index.append(len(passage))
            #     passage = passage + ctx + ' '
            #     current_context_list.append(ctx)
            #     current_context_end_index.append(len(passage))
            # else:
            min_value = min(values_greater_than_zero)
            max_value = max(cs_list)
            score_range = max_value - min_value
            # keep contexts whose similarity falls in the top third of the observed range
            threshold = max_value - score_range / 3
            for cs_val in cs_list:
                if cs_val >= threshold:
                    if all == False:
                        current_context_list.append(self.context_list[idx])
                    else:
                        current_context_list.append(self.all_context_list[idx])
                idx = idx + 1

        # build passage
        for txt in current_context_list:
            current_context_start_index.append(len(passage))
            passage = passage + txt + ' '
            current_context_end_index.append(len(passage))

        data = {}
        data['question'] = question
        data['passage'] = passage

        # Build the return object
        result = {}
        result['status'] = 1
        result['prediction'] = self.predict_from_passage(data)
        result['current_context_list'] = current_context_list
        # print(current_context_start_index)
        # print(current_context_end_index)
        # print(current_context_list)
        # print(passage)

        # Save the context from which the answer was predicted
        # best_span = result['prediction']['best_span']
        # for idx, ctx in enumerate(current_context_end_index):
        #     if (best_span[0] >= current_context_start_index[idx]) and (best_span[1] <= current_context_end_index[idx]):
        #         self.save_in_context_memory(current_context_list[idx],{'question':question,'answer':result['prediction']['best_span_str']})
        #         result['current_context'] = current_context_list[idx]
        #         continue;
        best_span_str = result['prediction']['best_span_str']
        for ctx in current_context_list:
            if best_span_str in ctx:
                self.save_in_context_memory(ctx)
                self.save_qa_in_context_memory({
                    'question': question,
                    'answer': best_span_str
                })
                result['current_context'] = ctx
                continue

        # return the current context memory
        result['context_memory'] = self.context_memory
        result['context_qa'] = self.context_qa

        return result

    # Helper Methods

    # Tokenize text using NLTK
    def tokenize_text(self, text, remove_stop_words=True, stem_words=True, filter_short_token=1):
        # split into words
        words = nltk.word_tokenize(text)
        # convert to lower case
        words = [w.lower() for w in words]
        # prepare regex for char filtering
        re_punc = re.compile('[%s]' % re.escape(string.punctuation))
        # remove punctuation from each word
        tokens = [re_punc.sub('', w) for w in words]
        # remove non-alphabetic tokens
        tokens = [word for word in tokens if word.isalpha()]
        # filter out stop words
        if remove_stop_words == True:
            stop_words = set(nltk.corpus.stopwords.words('english'))
            tokens = [w for w in tokens if not w in stop_words]
        # perform stemming of words
        if stem_words == True:
            porter = nltk.stem.porter.PorterStemmer()
            tokens = [porter.stem(word) for word in tokens]
        # filter out short tokens
        if filter_short_token > 0:
            tokens = [word for word in tokens if len(word) > filter_short_token]
        return tokens
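# A small worked example of the adaptive threshold used in predict_for_title above
# (the similarity scores are made up for illustration): contexts whose cosine
# similarity falls in the top third of the observed score range are kept.
cs_list_example = [0.0, 0.12, 0.40, 0.75]
positive_scores = [v for v in cs_list_example if v > 0.0]            # [0.12, 0.40, 0.75]
score_range_example = max(cs_list_example) - min(positive_scores)    # 0.75 - 0.12 = 0.63
threshold_example = max(cs_list_example) - score_range_example / 3   # 0.75 - 0.21 = 0.54
selected_scores = [v for v in cs_list_example if v >= threshold_example]  # [0.75]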
if mask & selectors.EVENT_WRITE:
    if data.w_bytes:
        print("ECHO: " + str(data.w_bytes) + " to " + str(data.addr))
        sent = soc.send(b"Returning " + data.w_bytes)
        data.w_bytes = data.w_bytes[sent:]
return True

# TODO:
# if socket drop count too high:
#     log or send email
# send email

if __name__ == "__main__":
    status_log = common.Logger(config.BASE_DIR + config.MODULES_LOG_FILE)

    sel = selectors.DefaultSelector()
    s_soc = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s_soc.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s_soc.bind((HOST, PORT))
    s_soc.listen(N_MODULES)
    s_soc.setblocking(False)
    sel.register(s_soc, selectors.EVENT_READ)
    connections = [s_soc]

    while True:
        events = sel.select()
        for key, mask in events:
            if key.data is None:
# Quick info: Cross-origin resource sharing (CORS) is a World Wide Web Consortium (W3C)
# specification (commonly considered part of HTML5) that lets JavaScript overcome the
# same-origin policy security restriction imposed by browsers.

# Project Python files
import textsummarizer as ts
import movieclassifier as mc
import appconfig as config
import common

# Initialization
service = Flask(__name__)
# This package exposes a Flask extension which by default enables CORS support on all
# routes, for all origins and methods. It allows parameterization of all CORS headers
# on a per-resource level. The package also contains a decorator, for those who prefer
# this approach.
CORS(service)
# Flask API is a drop-in replacement for Flask that provides an implementation of
# browsable APIs similar to what Django REST framework provides. It gives you properly
# content-negotiated responses and smart request parsing.
api = Api(service)

logger = common.Logger()  # some logging functionality
settings = config.Settings()  # appconfig.py

client = MongoClient(settings.mongodb)
db = client[settings.mongodb_database]
# GridFS is a specification for storing and retrieving files that exceed the
# BSON-document size limit of 16 MB. Instead of storing a file in a single document,
# GridFS divides the file into parts, or chunks, and stores each chunk as a separate
# document. By default, GridFS uses a chunk size of 255 kB.
fs = gridfs.GridFS(db)

# custom serializer which copes with ObjectIds
class JSONEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, bson.ObjectId):
            return str(o)
        return json.JSONEncoder.default(self, o)
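# Usage sketch for the custom encoder above: MongoDB documents carry bson.ObjectId
# values that the stock json module cannot serialize, so the encoder is passed in
# explicitly. The document below is illustrative only.
sample_document = {'_id': bson.ObjectId(), 'title': 'example'}
serialized = JSONEncoder().encode(sample_document)
# equivalently: json.dumps(sample_document, cls=JSONEncoder)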
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

import common
from pre_clustering import init_centroids

vectorizer: TfidfVectorizer
km: KMeans

logger = common.Logger(__name__)


def analyze(n_preview=10):
    global vectorizer, km

    # Encode:
    logger.info('Encoding...')
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=common.n_features,
                                 min_df=2, stop_words='english')
    common.X = vectorizer.fit_transform(common.doc_texts)
    common.save_pickle(vectorizer, 'vectorizer.pickle')
    common.vocab = np.array(vectorizer.get_feature_names())
    logger.info(f'X: {common.X.shape}')
    common.save_encoded_vocab()

    logger.info('Clustering...')
    # km = MiniBatchKMeans(n_clusters=common.n_topics, init=init_centroids(), init_size=1000, batch_size=1000,
    #                      verbose=0, random_state=common.random_seed)
    # km = MiniBatchKMeans(n_clusters=common.n_topics, verbose=1, random_state=1)
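    # A hedged sketch of how the clustering step might continue, based on the
    # commented-out constructors above (an assumption, not code from the source):
    # km = MiniBatchKMeans(n_clusters=common.n_topics, init_size=1000, batch_size=1000,
    #                      verbose=0, random_state=common.random_seed)
    # km.fit(common.X)
    # order = km.cluster_centers_.argsort()[:, ::-1]
    # for k in range(common.n_topics):
    #     logger.info(f'Cluster {k}: {" ".join(common.vocab[order[k, :n_preview]])}')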
class TextSummarizer:
    # Logger
    logger = common.Logger()

    # Class Text Summarizer
    def __init__(self):
        self.logger.log("TextSummarizer ------------------------------------ Init")

    def summarize(self, _id, content_text, word_limit):
        self.logger.log("_id: " + _id)
        self.logger.log("word_limit: " + str(word_limit))

        # File names
        path_stage0 = 'process/' + _id + '.json'
        path_stage1 = 'process/' + _id + '_o1.json'
        path_stage2 = 'process/' + _id + '_o2.json'
        path_stage3 = 'process/' + _id + '_o3.json'
        path_stage4 = 'process/' + _id + '_o4.json'

        # Create input file
        with open(path_stage0, 'w') as outfile:
            json.dump({"id": "123", "text": content_text}, outfile)

        # Statistical Parsing - Stage 1
        # Perform statistical parsing/tagging on a document in JSON format
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Ranked Keyphrases - Stage 2
        # Collect and normalize the key phrases from a parsed document
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Extractive Summarization - Stage 3
        # Calculate a significance weight for each sentence, using MinHash to
        # approximate a Jaccard distance from key phrases determined by TextRank
        kernel = pytextrank.rank_kernel(path_stage2)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Final Output - Stage 4
        # Summarize a document based on the most significant sentences and key phrases
        phrases = ", ".join(
            set([p for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=word_limit),
                           key=lambda x: x[1])
        s = []
        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))
        graf_text = " ".join(s)

        return {'excerpts': graf_text, 'keywords': phrases}
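# Usage sketch for the summarizer above, assuming pytextrank 1.x is installed and a
# 'process/' directory exists for the intermediate stage files; the text and id below
# are placeholders, not values from the source.
if __name__ == '__main__':
    summarizer = TextSummarizer()
    sample_text = ("Text summarization reduces a document to its most significant "
                   "sentences. Extractive approaches such as TextRank score sentences "
                   "by the key phrases they contain and keep the highest-ranked ones.")
    result = summarizer.summarize('demo-001', sample_text, word_limit=30)
    print(result['keywords'])
    print(result['excerpts'])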
def main():

    configs = yaml.safe_load(
        (pathlib.Path(sys.argv[0]).parent / 'configs.yaml').read_text())
    parsed, remaining = common.Flags(configs=['defaults']).parse(known_only=True)
    config = common.Config(configs['defaults'])
    for name in parsed.configs:
        config = config.update(configs[name])
    config = common.Flags(config).parse(remaining)
    logdir = pathlib.Path(config.logdir).expanduser()
    logdir.mkdir(parents=True, exist_ok=True)
    config.save(logdir / 'config.yaml')
    print(config, '\n')
    print('Logdir', logdir)

    import tensorflow as tf
    tf.config.experimental_run_functions_eagerly(not config.jit)
    message = 'No GPU found. To actually train on CPU remove this assert.'
    assert tf.config.experimental.list_physical_devices('GPU'), message
    for gpu in tf.config.experimental.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
    assert config.precision in (16, 32), config.precision
    if config.precision == 16:
        from tensorflow.keras.mixed_precision import experimental as prec
        prec.set_policy(prec.Policy('mixed_float16'))

    train_replay = common.Replay(logdir / 'train_episodes', **config.replay)
    eval_replay = common.Replay(logdir / 'eval_episodes', **dict(
        capacity=config.replay.capacity // 10,
        minlen=config.dataset.length,
        maxlen=config.dataset.length))
    step = common.Counter(train_replay.stats['total_steps'])
    outputs = [
        common.TerminalOutput(),
        common.JSONLOutput(logdir),
        common.TensorBoardOutput(logdir),
    ]
    logger = common.Logger(step, outputs, multiplier=config.action_repeat)
    metrics = collections.defaultdict(list)

    should_train = common.Every(config.train_every)
    should_log = common.Every(config.log_every)
    should_video_train = common.Every(config.eval_every)
    should_video_eval = common.Every(config.eval_every)
    should_expl = common.Until(config.expl_until // config.action_repeat)

    def make_env(mode):
        suite, task = config.task.split('_', 1)
        if suite == 'dmc':
            env = common.DMC(
                task, config.action_repeat, config.render_size, config.dmc_camera)
            env = common.NormalizeAction(env)
        elif suite == 'atari':
            env = common.Atari(
                task, config.action_repeat, config.render_size, config.atari_grayscale)
            env = common.OneHotAction(env)
        elif suite == 'crafter':
            assert config.action_repeat == 1
            outdir = logdir / 'crafter' if mode == 'train' else None
            reward = bool(['noreward', 'reward'].index(task)) or mode == 'eval'
            env = common.Crafter(outdir, reward)
            env = common.OneHotAction(env)
        else:
            raise NotImplementedError(suite)
        env = common.TimeLimit(env, config.time_limit)
        return env

    def per_episode(ep, mode):
        length = len(ep['reward']) - 1
        score = float(ep['reward'].astype(np.float64).sum())
        print(f'{mode.title()} episode has {length} steps and return {score:.1f}.')
        logger.scalar(f'{mode}_return', score)
        logger.scalar(f'{mode}_length', length)
        for key, value in ep.items():
            if re.match(config.log_keys_sum, key):
                logger.scalar(f'sum_{mode}_{key}', ep[key].sum())
            if re.match(config.log_keys_mean, key):
                logger.scalar(f'mean_{mode}_{key}', ep[key].mean())
            if re.match(config.log_keys_max, key):
                logger.scalar(f'max_{mode}_{key}', ep[key].max(0).mean())
        should = {'train': should_video_train, 'eval': should_video_eval}[mode]
        if should(step):
            for key in config.log_keys_video:
                logger.video(f'{mode}_policy_{key}', ep[key])
        replay = dict(train=train_replay, eval=eval_replay)[mode]
        logger.add(replay.stats, prefix=mode)
        logger.write()

    print('Create envs.')
    num_eval_envs = min(config.envs, config.eval_eps)
    if config.envs_parallel == 'none':
        train_envs = [make_env('train') for _ in range(config.envs)]
        eval_envs = [make_env('eval') for _ in range(num_eval_envs)]
    else:
        make_async_env = lambda mode: common.Async(
            functools.partial(make_env, mode), config.envs_parallel)
        train_envs = [make_async_env('train') for _ in range(config.envs)]
        eval_envs = [make_async_env('eval') for _ in range(num_eval_envs)]
    act_space = train_envs[0].act_space
    obs_space = train_envs[0].obs_space
    train_driver = common.Driver(train_envs)
    train_driver.on_episode(lambda ep: per_episode(ep, mode='train'))
    train_driver.on_step(lambda tran, worker: step.increment())
    train_driver.on_step(train_replay.add_step)
    train_driver.on_reset(train_replay.add_step)
    eval_driver = common.Driver(eval_envs)
    eval_driver.on_episode(lambda ep: per_episode(ep, mode='eval'))
    eval_driver.on_episode(eval_replay.add_episode)

    prefill = max(0, config.prefill - train_replay.stats['total_steps'])
    if prefill:
        print(f'Prefill dataset ({prefill} steps).')
        random_agent = common.RandomAgent(act_space)
        train_driver(random_agent, steps=prefill, episodes=1)
        eval_driver(random_agent, episodes=1)
        train_driver.reset()
        eval_driver.reset()

    print('Create agent.')
    train_dataset = iter(train_replay.dataset(**config.dataset))
    report_dataset = iter(train_replay.dataset(**config.dataset))
    eval_dataset = iter(eval_replay.dataset(**config.dataset))
    agnt = agent.Agent(config, obs_space, act_space, step)
    train_agent = common.CarryOverState(agnt.train)
    train_agent(next(train_dataset))
    if (logdir / 'variables.pkl').exists():
        agnt.load(logdir / 'variables.pkl')
    else:
        print('Pretrain agent.')
        for _ in range(config.pretrain):
            train_agent(next(train_dataset))
    train_policy = lambda *args: agnt.policy(
        *args, mode='explore' if should_expl(step) else 'train')
    eval_policy = lambda *args: agnt.policy(*args, mode='eval')

    def train_step(tran, worker):
        if should_train(step):
            for _ in range(config.train_steps):
                mets = train_agent(next(train_dataset))
                [metrics[key].append(value) for key, value in mets.items()]
        if should_log(step):
            for name, values in metrics.items():
                logger.scalar(name, np.array(values, np.float64).mean())
                metrics[name].clear()
            logger.add(agnt.report(next(report_dataset)), prefix='train')
            logger.write(fps=True)
    train_driver.on_step(train_step)

    while step < config.steps:
        logger.write()
        print('Start evaluation.')
        logger.add(agnt.report(next(eval_dataset)), prefix='eval')
        eval_driver(eval_policy, episodes=config.eval_eps)
        print('Start training.')
        train_driver(train_policy, steps=config.eval_every)
        agnt.save(logdir / 'variables.pkl')
    for env in train_envs + eval_envs:
        try:
            env.close()
        except Exception:
            pass
def check_system_uptime(logger):
    uptime = -1
    try:
        with open("/proc/uptime", "r") as procfs:
            line = procfs.readline()
            uptime = int(float(line.split(" ")[0]))
    except Exception as e:
        logger.add_log("ERROR: Failed to read system uptime, " + str(e))
    return uptime


if __name__ == "__main__":
    status_log = common.Logger(config.BASE_DIR + config.MONITOR_LOG_FILE)
    previous_uptime = 0

    while True:
        ip_check = check_external_ip(status_log)
        if config.DEBUG:
            print("Checking external IPv4 address...")
            if ip_check:
                print("SUCCESS: Address updated.")
            else:
                print("FAILURE: Could not update address.")

        uptime = check_system_uptime(status_log)
        if config.DEBUG:
            print("\nChecking system uptime...")
            if uptime < 0: