Example #1
    def __init__(self,
                 url: str,
                 method: str = 'GET',
                 *,
                 callback=None,
                 load_js: bool = False,
                 metadata: dict = None,
                 headers: dict = None,
                 request_config: dict = None,
                 request_session=None,
                 res_type: str = 'text',
                 **kwargs):
        """
        Initialization parameters
        """
        self.url = url
        self.method = method.upper()
        if self.method not in self.METHOD:
            raise ValueError('%s method is not supported' % self.method)

        self.callback = callback
        self.load_js = load_js
        self.headers = headers
        self.metadata = metadata if metadata is not None else {}
        self.request_session = request_session
        if request_config is None:
            # Copy the class-level default so per-request changes don't
            # leak back into REQUEST_CONFIG.
            self.request_config = self.REQUEST_CONFIG.copy()
        else:
            self.request_config = request_config
        self.res_type = res_type
        self.kwargs = kwargs

        self.close_request_session = False
        self.logger = get_logger(name=self.name)
        self.retry_times = self.request_config.get('RETRIES', 3)
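
A hypothetical instantiation of the class this __init__ belongs to; the class name Request, the URL, the callback, and the res_type value are illustrative, not taken from the example:

# Build a POST request whose response is parsed as JSON and retried
# up to five times; every concrete value here is illustrative.
request = Request(
    'https://httpbin.org/post',
    method='POST',
    callback=lambda response: print(response),
    metadata={'page': 1},
    request_config={'RETRIES': 5},
    res_type='json',
)
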
Example #2
    def test_logfile_created(self):
        from tempfile import mkdtemp
        from core.utils import get_logger
        from os.path import isfile

        filename = "%s/sourcerer.log" % mkdtemp() 
        logger = get_logger('test', filename)

        if not isfile(filename):
            self.fail("Log file %s not created by get_logger()" % filename)
Example #3
    def test_log_filename_not_writeable(self):
        from core.utils import get_logger

        filename = "%s/sourcerer.log" % self._get_nonexistant_directory_name()

        try:
            logger = get_logger('test', filename)
        except IOError:
            # get_logger() should handle the IOError internally rather
            # than let it propagate to the caller.
            self.fail("IOError from get_logger() was not handled for a nonexistent logfile directory.")
Example #4
    def __init__(self, middleware, loop=None):
        if not self.start_urls or not isinstance(self.start_urls, list):
            raise ValueError(
                "Spider must have a param named start_urls, e.g. start_urls = ['https://www.github.com']"
            )
        self.logger = get_logger(name=self.name)
        self.loop = loop or asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)
        self.request_queue = asyncio.Queue()
        # Cap concurrent requests; spiders may override `concurrency`.
        self.sem = asyncio.Semaphore(getattr(self, 'concurrency', 3))
        self.middleware = middleware or Middleware()
Example #5
    def __init__(self, bucket, resource='s3'):
        self.bucket = bucket
        self.app_logger = get_logger('app')

        # `session` and `clients` are class-level caches shared across
        # instances; rebuild the boto3 session once it has expired.
        if 'session' not in self.session or self.session['expire'] < datetime.utcnow():
            self.session.update({
                'session': boto3.Session(
                    aws_access_key_id=configs.AWS_ACCESS_KEY_ID,
                    aws_secret_access_key=configs.AWS_SECRET_ACCESS_KEY),
                'expire': datetime.utcnow() + timedelta(hours=1),
            })
        if resource not in self.clients:
            self.clients[resource] = self.session['session'].client(resource)
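
The snippet reads self.session and self.clients without ever assigning them, so they are presumably class-level dictionaries acting as shared caches. A sketch of that surrounding class (the name S3Connector and the attribute layout are assumptions):

class S3Connector:
    """Hypothetical owner of the __init__ above."""

    # Shared by every instance: one boto3 session (with an expiry
    # timestamp) and one cached client per resource type.
    session = {}
    clients = {}

With that layout, the expiry check rebuilds the session at most once an hour, and repeated instantiations reuse the cached clients.
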
Example #6
    def test_log_error(self):
        from core.utils import get_logger
        from tempfile import mkdtemp

        filename = "%s/sourcerer.log" % mkdtemp()
        logger = get_logger('test', filename)

        error_msg = "This is an error."
        logger.error(error_msg)

        # The only log entry in the file should contain our error message.
        with open(filename) as f:
            self.assertIn(error_msg, f.readline())
Example #7
def evaluate_line(input_str):
    config = load_config(file_path + "/" + FLAGS.config_file)
    logger = get_logger(FLAGS.log_file)
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    with open(file_path + "/" + FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, file_path + "/" + FLAGS.ckpt_path,
                             load_word2vec, config, id_to_char, logger)
        result = model.evaluate_line(sess,
                                     input_from_line(input_str, char_to_id),
                                     id_to_tag)
        print(result)
        return result
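
A hypothetical call, assuming FLAGS points at a trained checkpoint (the input sentence is illustrative). Note that the config, vocabulary, and model are reloaded on every call, so a long-lived session would be cheaper for repeated queries:

result = evaluate_line("Barack Obama visited Berlin in 2016.")
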
Example #8
import sys
import os
import time
import argparse
from core.collection import DataCollector
from core.utils import get_logger
logger = get_logger('collect', dest=['console', 'file'])

if __name__ == "__main__":

	parser = argparse.ArgumentParser(description='Collect data')
	parser.add_argument('-s', '--simulate',
		action='store_true',
		dest='simulate',
		default=False,
		help='''Simulate data collection''')
	parser.add_argument('-i', '--interval',
		action='store',
		dest='interval',
		default='2',
		help='''Interval between scans, in seconds. Only active if simulate is True''')
	parser.add_argument('-d', '--directory',
		action='store',
		dest='directory',
		default='tmp',
		help='Directory to watch')
	parser.add_argument('-p', '--parent',
		action='store_true',
		dest='parent',
		default=False,
		help='Monitor the provided directory for the first new folder, then monitor that folder for new files')
Example #9
import os

import pyhocon
import torch
import torchvision

from core.engine import train, evaluate, LargerHolder
from core.metric import AverageMetric, AccuracyMetric
from core.model import cifar_resnet20
from core.loss import CACLoss
from core.utils import get_args, get_logger
from core.utils import set_cudnn_auto_tune
from core.utils import FLOPs
from core.utils import replace_convs_with_cac

if __name__ == "__main__":
    args = get_args()
    hocon = pyhocon.ConfigFactory.parse_file(args.config)
    output_directory = args.output_directory
    # exist_ok=False: fail fast instead of overwriting a previous run.
    os.makedirs(output_directory, exist_ok=False)
    logger = get_logger("train", output_directory)

    set_cudnn_auto_tune()

    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    train_transform = torchvision.transforms.Compose([
        torchvision.transforms.RandomCrop(size=32, padding=4),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=hocon.get_list("dataset.mean"),
            std=hocon.get_list("dataset.std"),
        ),
    ])
    val_transform = torchvision.transforms.Compose([
Example #10
import requests
from bs4 import BeautifulSoup as bs
import traceback

from core.db import insert
from core.utils import (
    parse_time,
    headers,
    logger_time,
    get_logger,
)

SRC_URL = 'http://openinsider.com/screener?s={}&o=&pl=&ph=&ll=&lh=&fd=0&fdr=&td=0&tdr=&fdlyl=&fdlyh=&daysago=&xp=1&xs=1&vl=&vh=&ocl=&och=&sic1=-1&sicl=100&sich=9999&grp=0&nfl=&nfh=&nil=&nih=&nol=&noh=&v2l=&v2h=&oc2l=&oc2h=&sortcol=0&cnt=10000000&page=1'

logger = get_logger('openinsider-service')


def get_request(insider_url, ticker):
    logger.info(f'Get request with tickername: {ticker}')
    session = requests.Session()
    data = session.get(insider_url.format(ticker),
                       headers=headers,
                       stream=True)
    return data.text


@logger_time
def get_data(html):
    soup = bs(html, 'lxml')
    rows = soup.find('table', class_='tinytable').find('tbody').find_all('tr')
    for i in rows:
Example #11
from typing import Dict, List, Tuple, AnyStr
from datetime import datetime

import requests
from pytz import timezone

from core.utils import (
    parse_time,
    headers,
    get_logger,
    url,
    logger_time,
)
from core.db import insert

logger = get_logger('pulse-service')


def get_cursor_number(url: str, ticker: str, cursor='9999999') -> str:
    session = requests.Session()
    logger.info(f'Get cursor number from {url.format(ticker, cursor)}')
    data = session.get(url.format(ticker, cursor),
                       headers=headers,
                       stream=True)
    next_cursor = data.json()['payload']['nextCursor']
    logger.info(f"Prev cursor number is {next_cursor}")
    return next_cursor


def get_data_from_api(url: str, ticker: str, cursor: str) -> None:
    session = requests.Session()
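
get_data_from_api is cut off above, but the pair of helpers suggests cursor-based pagination. A sketch of how they might be driven together (the loop, the stop condition, and the collect_ticker name are assumptions):

def collect_ticker(ticker: str) -> None:
    # Walk the feed cursor by cursor, starting from the sentinel value;
    # treating an empty cursor as the stop condition is an assumption.
    cursor = '9999999'
    while cursor:
        get_data_from_api(url, ticker, cursor)
        cursor = get_cursor_number(url, ticker, cursor)
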
Example #12
import os
import time
import argparse
from core.preprocessing import Preprocessor
from core.utils import get_logger

logger = get_logger("preprocess", dest=["console", "file"])

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="Preprocess data")
    parser.add_argument("config", action="store", help="Name of configuration file")
    args = parser.parse_args()
    logger.info("Loading preprocessing pipeline from %s" % args.config)

    preproc = Preprocessor(args.config)
    preproc.run()
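
Examples #8, #12, and #14 all call get_logger(name, dest=['console', 'file']). A minimal sketch of a get_logger with that signature, assuming the stdlib logging module (the per-name log-file naming and the format string are assumptions):

import logging


def get_logger(name, dest=('console',)):
    """Return a named logger with console and/or file handlers."""
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s %(name)s %(levelname)s %(message)s')
    if 'console' in dest:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    if 'file' in dest:
        # Hypothetical convention: one log file per logger name.
        file_handler = logging.FileHandler('%s.log' % name)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
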
Example #13
def train():
    # Load the datasets.
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tagging scheme (IOB / IOBES); IOBES is the default.
    # The dev set should be converted too, not just train and test.
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            # 'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Convert the sentences into numeric features.
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    # Shorter sequences are zero-padded when batching.
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings: allocate memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    # Reset so the mean loss covers each steps_check window.
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
Example #14
#!/usr/bin/env python
import os
import time
import argparse
from core.stimulation import Stimulator
from core.utils import get_logger
logger = get_logger('stimulate', dest=['console', 'file'])

if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='Run stimulation')
	parser.add_argument('config',
		action='store',
		nargs='?',
		default='stim-01',
		help='Name of configuration file')
	args = parser.parse_args()

	stim = Stimulator(args.config)
	stim.run() # this will start an infinite run loop
Example #15
from django import forms
from django.forms.widgets import Select, HiddenInput, CheckboxSelectMultiple
from core.models import Comment, CommentType, Topic
from tagger.models import Tag
from users.models import UserProfile
from core import utils

logger = utils.get_logger(__name__)

class CommentDeleteForm(forms.Form):
    allcomments = Comment.objects.filter(is_deleted=False).filter(is_parent=True)
    comments = forms.ModelMultipleChoiceField(allcomments)


class CommentTopicForm(forms.Form):
    allcomments = Comment.objects.filter(is_deleted=False).filter(is_parent=True)
    alltopics = Topic.objects.filter(is_deleted=False)
    comment = forms.ModelChoiceField(allcomments, empty_label=None)
    topic = forms.ModelChoiceField(alltopics, empty_label=None)

    
class TopicDeleteForm(forms.Form):
    alltopics = Topic.objects.filter(is_deleted=False)
    topics = forms.ModelMultipleChoiceField(alltopics)


class NewSummaryForm(forms.Form):
    """Form to let a user create a new summary for a topic."""

    alltopics = Topic.objects.filter(is_deleted=False)