# blog_parser.py

from util import get_log, bannerfy
from paragraph import Paragraph, Paragraphs, ParagraphsAction
from machine_html_parser import State, Attrs
from machine_html_parser import TransitionData, MachineHTMLParser

from typing import List, Iterable, Tuple, Callable, Optional
from datetime import datetime
from functools import reduce
from pathlib import Path
from logging import DEBUG

log = get_log(__file__, stderr=True)
log.setLevel(DEBUG)

# State Transition Diagram
#
# start ---> metadata <---> author
#        |            <---> date
#        |
#        ---> article <---> subtitle
#        |
#        ---> done

valid_transitions: Iterable[Tuple[State, State]] = set([
    ('start', 'metadata'),
    ('metadata', 'title'),
    ('title', 'metadata'),
    ('metadata', 'author_1'),
    ('author_1', 'author_2'),
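# A minimal sketch (assumption, not part of blog_parser.py, whose transition
# set is truncated above) of how a parser might enforce the diagram:
def check_transition(from_state: State, to_state: State) -> None:
    # reject any edge not listed in valid_transitions
    if (from_state, to_state) not in valid_transitions:
        raise ValueError(f'invalid transition: {from_state} -> {to_state}')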
import os
import sys
import arrow
import asyncio
import logging
import traceback
import requests
import threading
from concurrent.futures import ThreadPoolExecutor

requests.packages.urllib3.disable_warnings()

dir_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(dir_root)

from exchange.exchange_trade import exchange_trade
from arbitrage.stat_arbitrage import stat_arbitrage
import conf.conf_aliyun
import conf
import util

logger = util.get_log(__name__)

userid = 0
ex1_id = 'binance'
ex2_id = 'okex'
symbol = 'EOS/BTC'

ex1 = exchange_trade.create(userid, ex1_id)
ex2 = exchange_trade.create(userid, ex2_id)

sa = stat_arbitrage(symbol, ex1, ex2)
sa.rebalance_set(True, 0.5)

tasks = sa.add_async_task()
pending = asyncio.Task.all_tasks()
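# A minimal sketch (assumption, not in the original snippet) of driving the
# collected tasks to completion with the pre-3.10-style event loop API that
# asyncio.Task.all_tasks() above implies:
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(asyncio.gather(*pending))
finally:
    loop.close()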
from paragraph import Paragraph, Paragraphs, ParagraphsAction
from util import get_log, word_count
import numpy as np  # type: ignore

from typing import Iterable, List, Dict
from datetime import datetime, timedelta
from logging import DEBUG

Samples = List[float]
Label = str
Stats = Dict[Label, Samples]

# logging
log = get_log(__file__, stderr=True, mode='w')  # mode 'w' to overwrite
log.setLevel(DEBUG)


def log_info(paragraph: Paragraph, length: float, time: float):
    id_ = f'{paragraph.filename}|{paragraph.paragraph_title}'
    log.info(f'[length:{length:8.0f}, time:{time:8.2f}] {id_}')


# formatting
class StatsFmt:
    fields: List[str] = ['min', 'max', 'mean', 'std', 'sum']
    width: int = 7
    precision: int = 2
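# A minimal sketch (assumption, not in the original file) of summarizing one
# label's samples with numpy, reusing the field names declared in StatsFmt;
# each name maps directly onto a numpy reduction of the same name:
def summarize(samples: Samples) -> Dict[str, float]:
    arr = np.array(samples)
    return {f: float(getattr(np, f)(arr)) for f in StatsFmt.fields}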
# -*- coding: utf-8 -*-
#
# PostGIS support wrapper.
#
# Author: Just van den Broecke

from util import get_log

log = get_log("postgis")

try:
    import psycopg2
    import psycopg2.extensions
except ImportError:
    log.error(
        "cannot find package psycopg2 for Postgres client support, please install psycopg2 first!"
    )
    # sys.exit(-1)


class PostGIS:
    def __init__(self, config):
        # Read the configuration
        self.config = config

    def connect(self):
        try:
            conn_str = "dbname=%s user=%s host=%s port=%s" % (
                self.config['database'],
                self.config['user'],
                self.config.get('host', 'localhost'),
                self.config.get('port', '5432'))
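# A minimal sketch (assumption; the original connect() is truncated above) of
# the standard psycopg2 pattern such a method usually finishes with:
def _connect_sketch(conn_str):
    conn = psycopg2.connect(conn_str)  # open the connection
    return conn, conn.cursor()         # hand back connection + cursor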
from paragraph_stats import ParagraphStatsCollector
from util import bannerfy, get_log, input_command, get_new_name
from middlewares import Middlewares, pa_log, pa_sanitize_ws, pa_chunk_long
from middlewares import pa_remove_empty, pa_cat_short, pa_remove_ptag
from es_middleware import ESMiddleware
from es_config import ES_CONFIG, my_analyzer, my_analysis, JsonObject
from elasticsearch import Elasticsearch  # type: ignore

from pprint import pprint, pformat
from glob import glob
from logging import DEBUG
from typing import Dict, Iterable
from itertools import chain

log = get_log(__file__, stderr=True, mode='w')
log.setLevel(DEBUG)


class BlogIndexConfig(ES_CONFIG):
    index: str = 'site'

    # the only reason this is a property is that it is a bit convoluted to
    # create
    @property
    def mappings(self):
        default_prop: Dict[str, str] = {
            'type': 'text',
            'analyzer': 'my_analyzer'
        }
        properties: Iterable[str] = list(
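# A minimal sketch (assumption, not in the original file; the mappings
# property above is truncated) of creating the index from such a config,
# assuming an elasticsearch-py 7.x style client:
es = Elasticsearch()
config = BlogIndexConfig()
es.indices.create(index=config.index, body={
    'settings': {'analysis': my_analysis},  # my_analysis imported above
    'mappings': config.mappings,
})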
import sys
import os
import util
import argparse
import data_manager
import website_rating_model

logger = util.get_log('website_rating')


def main():
    parser = argparse.ArgumentParser(description='Websites Dynamic Rating')
    parser.add_argument('-b', '--basepath',
                        help='The base path to store the data set',
                        type=str,
                        required=False,
                        default=str(
                            os.path.join(os.path.expanduser('~'), 'dataset')),
                        dest='basepath')
    parser.add_argument('-i', '--init',
                        action='store_true',
                        default=False,
                        dest='init',
                        help='initial environment')
    parser.add_argument('-u', '--pages',
                        action='store_true',
                        default=False,
                        dest='pages',
import os
import random
import sqlite3
import sys
import uuid

from util import get_log

log = get_log('critical')


class DataStore(object):
    # MAINTENANCE

    def get_db_path(self):
        node_id = self.node_id
        port_int = self.port
        port_temp = str(port_int)
        port = port_temp[:-1]
        filename = f'abrim_{node_id}_{port}.sqlite'
        try:
            # noinspection PyUnresolvedReferences
            import appdirs
            udd = appdirs.user_data_dir("abrim", "abrim_node")
            db_path = os.path.join(udd, filename)
            if not os.path.exists(udd):
                os.makedirs(udd)
        except ImportError:
            try:
                db_path = f".{os.path.basename(sys.modules['__main__'].__file__)}{filename}"
            except AttributeError:
# (this snippet starts mid-module; the imports and app object are assumed to
# be defined earlier in the file, e.g.:)
# from flask import Flask, request, abort
# from flask_limiter import Limiter
# from flask_limiter.util import get_remote_address
# app = Flask(__name__)
# get_config and limit_content_length are project-local helpers

# get configuration
conf = get_config("configs.yaml")
MAX_CONTENT_LENGTH = conf["config"]["MAX_CONTENT_LENGTH"]

# implement rate limit
limiter = Limiter(app,
                  key_func=get_remote_address,
                  default_limits=[
                      conf["config"]["limiter"]["day"],
                      conf["config"]["limiter"]["hour"],
                      conf["config"]["limiter"]["second"],
                  ])

# get logger
logobj = get_log()


@app.route('/sorting', methods=['POST'])
@limit_content_length(MAX_CONTENT_LENGTH)
def sort_list():
    """
    This route receives a list of int numbers as data, and an order for sorting
    :return: return a sorted list of data consisting of integer numbers
    """
    if not request.json:
        logobj.error("Empty request.json")
        abort(400)
    if not all([request.json.get('data'), request.json.get('order')]):
        logobj.error("KeyError, no data and no order supplied")
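# A minimal client sketch (assumption, not in the original file) of exercising
# the /sorting route; the host, port, and 'asc' order value are hypothetical:
import requests

resp = requests.post('http://localhost:5000/sorting',
                     json={'data': [3, 1, 2], 'order': 'asc'})
print(resp.status_code, resp.json())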
import os
import pathlib
import util

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

logger = util.get_log('website_rating_model')


class WebsiteRatingModel:
    def __init__(self, train_container, valid_container):
        if train_container:
            self.train_container = train_container
        else:
            self.train_container = os.path.join(str(pathlib.Path.home()), 'train')
        if valid_container:
            self.valid_container = valid_container
        else:
            self.valid_container = os.path.join(str(pathlib.Path.home()), 'valid')
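# A minimal sketch (assumption, not in the original file) of the classic
# text-classification pipeline the imports above suggest: load the training
# folder with load_files, vectorize, tf-idf weight, then fit a classifier.
def _train_sketch(train_container):
    data = load_files(train_container, encoding='utf-8', decode_error='ignore')
    clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])
    clf.fit(data.data, data.target)
    return clf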
#
# using selenium to download all website pages.
#
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import os
import util
import numpy as np

logger = util.get_log('data_manager')


class DataManager:
    def __init__(self, base_path):
        if not base_path:
            base_path = str(os.path.join(os.path.expanduser('~'), 'dataset'))
        self.pages_folder_path = os.path.join(base_path, 'pages')
        self.train_path = os.path.join(base_path, 'train')
        self.valid_path = os.path.join(base_path, 'valid')
        self.test_path = os.path.join(base_path, 'test')
        self.urls_folder_path = os.path.join(base_path, 'urls')
        self.map_category_number_to_name = {}
        self.map_category_to_train_number = {}
        self.map_category_to_valid_number = {}
        self.category = 0
        self.train_number = 0
        self.valid_number = 0
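# A minimal sketch (assumption, not in the original file) of fetching one page
# with selenium and extracting its text with BeautifulSoup; the headless
# Chrome options, the fixed sleep, and the save path are hypothetical.
def _fetch_sketch(url, save_path):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(2)  # crude wait for dynamically rendered content
        text = BeautifulSoup(driver.page_source, 'html.parser').get_text()
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(text)
    finally:
        driver.quit()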