def __connect(self, wsURL):
    '''Connect to the websocket in a thread.'''
    self.logger.debug("Starting thread")

    ssl_defaults = ssl.get_default_verify_paths()
    sslopt_ca_certs = {'ca_certs': ssl_defaults.cafile}

    self.ws = websocket.WebSocketApp(wsURL,
                                     on_message=self.__on_message,
                                     on_close=self.__on_close,
                                     on_open=self.__on_open,
                                     on_error=self.__on_error,
                                     header=self.__get_auth())

    setup_custom_logger('websocket', log_level=settings.LOG_LEVEL)
    self.wst = threading.Thread(
        target=lambda: self.ws.run_forever(sslopt=sslopt_ca_certs))
    self.wst.daemon = True
    self.wst.start()
    self.logger.info("Started thread")

    # Wait for connect before continuing
    conn_timeout = 5
    while (not self.ws.sock or not self.ws.sock.connected) and conn_timeout and not self._error:
        sleep(1)
        conn_timeout -= 1

    if not conn_timeout or self._error:
        self.logger.error("Couldn't connect to WS! Exiting.")
        self.exit()
        sys.exit(1)
def main(options):
    global logger
    logger = setup_custom_logger('root', filename=options.log_file)

    scraper = create_scraper_by_type(options.conference)
    papers = scraper.scrape_list_of_papers(options.conference_program_url)

    work_data = []
    for date, papers_by_date in papers.iteritems():
        ensure_dir(os.path.join(options.destination_dir, date))
        for session, papers_by_session in papers_by_date.iteritems():
            ensure_dir(os.path.join(options.destination_dir, date, session))
            for section, papers_by_section in papers_by_session.iteritems():
                base_dir = os.path.join(options.destination_dir, date, session, section)
                ensure_dir(base_dir)
                if not os.listdir(base_dir):
                    logger.info('Directory is empty {}, will process it'.format(base_dir))
                    work_data.append((base_dir, papers_by_section))
                else:
                    logger.info('Something is in {}, skipping it'.format(base_dir))

    # download_papers(work_data[0])
    pool = mp.Pool(8)
    for results in pool.imap_unordered(download_papers, work_data):
        for paper_name, paper_file in results:
            print paper_name
            print ' ' + paper_file
    pool.close()
    pool.join()
def download_papers(data):
    global logger
    logger = logging.getLogger('root')
    if not logger.handlers:
        logger = setup_custom_logger('root')

    base_dir, papers_by_section = data
    gd = GoogleDownloader()
    results = []
    for paper in papers_by_section:
        try:
            print paper
            folder = '{authors}_{paper-name}'.format(**paper)
            paper_dir = os.path.join(base_dir, folder)
            ensure_dir(paper_dir)

            possible_pdfs = gd.search_paper_file(paper['paper-name'])
            existing_hashes = [
                hashfile(open(os.path.join(paper_dir, filename), 'rb'), hashlib.md5())
                for filename in os.listdir(paper_dir)
            ]
            for link_text, pdf_url in possible_pdfs:
                filename = os.path.join(paper_dir, link_text + '.pdf')
                i = 1
                while os.path.exists(filename):
                    i += 1
                    filename = os.path.join(paper_dir, link_text + str(i) + '.pdf')
                print link_text
                logger.info(
                    'Downloading search result "{}" with URL {} to file "{}"'.format(
                        link_text, pdf_url, filename))

                response = requests.get(pdf_url, stream=True, verify=False)
                with open(filename, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                with open(filename, 'rb') as downloaded_file:
                    file_hash = hashfile(downloaded_file, hashlib.md5())

                if file_hash in existing_hashes:
                    os.unlink(filename)
                    logger.info('Hash of downloaded file is {} '
                                'and we already have this file; removing it'.format(file_hash))
                else:
                    existing_hashes.append(file_hash)
                    logger.info('Hash of downloaded file is {}, '
                                'adding it to folder'.format(file_hash))
                    results.append((folder, filename))
        except Exception as e:
            logger.exception('Download of paper "{}" failed'.format(folder))
    return results
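# The snippet above relies on a hashfile() helper that is not shown. A minimal
# sketch of what such a helper might look like (assumed, not taken from the
# original project): read the open file in fixed-size chunks, feed them to the
# hasher, and return a hex digest so hashes can be compared as plain strings.
def hashfile(afile, hasher, blocksize=65536):
    """Hash an open file object chunk by chunk and return the hex digest."""
    buf = afile.read(blocksize)
    while buf:
        hasher.update(buf)
        buf = afile.read(blocksize)
    return hasher.hexdigest()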
import json
import re

import pandas as pd
import tqdm

from utils import log
from datasets.preprocess_utils import download_raw_and_preprocess

logger = log.setup_custom_logger(__name__)

REPLACE_TOKS = [
    ("#39;", "'"),
    ("#36;", "$"),
    ("&gt;", ">"),
    ("&lt;", "<"),
    ("\\$", "$"),
    ("quot;", "\""),
    ("\\", " "),
    ("#145;", "\""),
    ("#146;", "\""),
    ("#151;", "-")
]


def preprocess_ag_data(input_filename, output_filename,
                       include_title=True, include_author_media=True):
    """Preprocess the raw AG's News csv into Fibber's JSON format."""
    logger.info("Start preprocessing data, and save at %s.", output_filename)

    df = pd.read_csv(input_filename, header=None)
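# preprocess_ag_data() is truncated above; presumably the REPLACE_TOKS pairs are
# applied to the raw title/body text at some later point. As an illustration only
# (replace_tokens is a hypothetical helper, not part of the original file), that
# substitution step could look like this:
def replace_tokens(text):
    """Apply every (old, new) substitution in REPLACE_TOKS to a single string."""
    for old, new in REPLACE_TOKS:
        text = text.replace(old, new)
    return text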
try:
    # If it's not a class, skip it
    site_class = getattr(sys.modules[__name__], site.lower())
except AttributeError as e:
    print("\nThere is no module named " + site + "\n")
    continue

dl_path = os.path.expanduser(config[site]['download_path'])
# Create dl path if not there
try:
    os.makedirs(dl_path)
except Exception as e:
    pass

num_files = int(config[site]['number_of_files'])
progress_file = config[site]['progress_file'].lower()
threads = int(config[site]['threads'])
log_file = os.path.join(dl_path, site + '.log')
logger = setup_custom_logger('root', log_file)

try:
    search = config[site]['search'].split(',')
except KeyError as e:
    search = []

if search:
    for term in search:
        site_term = site + ":" + term
        scrape[site_term] = Process(site_class, dl_path, progress_file,
                                    term, num_files, threads)
else:
    scrape[site] = Process(site_class, dl_path, progress_file,
                           '', num_files, threads)

# Start site parser
from utils.log import setup_custom_logger

logger = setup_custom_logger('root')


class TimeIntervalError(Exception):
    def __init__(self):
        logger.error('TimeIntervalError')

    def __str__(self):
        return 'TimeIntervalError: end time should be 1h after the start time'


class BinanceAPIException(Exception):
    def __init__(self, response):
        self.code = 0
        try:
            json_res = response.json()
        except ValueError:
            self.message = 'Invalid JSON error message from Binance: {}'.format(response.text)
        else:
            self.code = json_res['code']
            self.message = json_res['msg']
        logger.error(self.message)
        self.status_code = response.status_code
        self.response = response
        self.request = getattr(response, 'request', None)

    def __str__(self):  # pragma: no cover
        return 'APIError(code=%s): %s' % (self.code, self.message)
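# A possible call site for BinanceAPIException, shown only as an assumed sketch
# (_handle_response is a hypothetical helper, not part of the snippet above):
# raise the exception whenever the HTTP status code is outside the 2xx range, so
# the constructor can pull code/msg out of the error response.
def _handle_response(response):
    """Return the parsed JSON body, or raise BinanceAPIException on an error reply."""
    if not str(response.status_code).startswith('2'):
        raise BinanceAPIException(response)
    return response.json()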
"""This metric computes the cosine similarity between two sentences. The sentence embedding is the sum of GloVe word embeddings.""" import numpy as np from nltk import word_tokenize from utils import log from metrics.metric_base import MetricBase from utils import get_glove_emb, get_nltk_data, get_stopwords logger = log.setup_custom_logger('glove_semantic_similairty') def compute_emb(emb_table, tok_to_id, x): """Compute the sum of word embeddings for a sentence. Args: emb_table (np.array): the glove embedding table. tok_to_id (dict): a dict mapping strs to ints. x (str): text. Returns: (np.array): the sum of word embedding. """ toks = word_tokenize(x) embs = [] for item in toks: if item.lower() in tok_to_id: embs.append(emb_table[tok_to_id[item.lower()]]) return np.sum(embs, axis=0)
import atexit
import signal
import sys
from os.path import getmtime

import bitmex
from settings import settings
from utils import log, constants, errors, math

# Used for reloading the bot - saves modified times of key files
import os
watched_files_mtimes = [(f, getmtime(f)) for f in settings.WATCHED_FILES]

#
# Helpers
#
logger = log.setup_custom_logger('root')


class ExchangeInterface:
    def __init__(self, dry_run=False):
        self.dry_run = dry_run
        if len(sys.argv) > 1:
            self.symbol = sys.argv[1]
        else:
            self.symbol = settings.SYMBOL
        self.bitmex = bitmex.BitMEX(base_url=settings.BASE_URL, symbol=self.symbol,
                                    apiKey=settings.API_KEY, apiSecret=settings.API_SECRET,
                                    orderIDPrefix=settings.ORDERID_PREFIX, postOnly=settings.POST_ONLY,
if config[site]['enabled'].lower() == 'true':
    try:
        # If it's not a class, skip it
        site_class = getattr(sys.modules[__name__], site.lower())
    except AttributeError as e:
        print("\nThere is no module named " + site + "\n")
        continue

    dl_path = os.path.expanduser(config[site]['download_path'])
    # Create dl path if not there
    try:
        os.makedirs(dl_path)
    except Exception as e:
        pass

    num_files = int(config[site]['number_of_files'])
    threads = int(config[site]['threads'])
    log_file = os.path.join(dl_path, site + '.log')
    logger = setup_custom_logger('root', log_file)

    try:
        search = config[site]['search'].split(',')
    except KeyError as e:
        search = []

    if search:
        for term in search:
            site_term = site + ":" + term
            scrape[site_term] = Process(site_class, dl_path, term, num_files, threads)
    else:
        scrape[site] = Process(site_class, dl_path, '', num_files, threads)

# Start site parser
try:
    for site in scrape:
        print("#### Scraping: " + site)
    sys.exit(0)
config.read(config_file)

# Read scrape config file
scrape_config_file = './configs/scrape.ini'
if not os.path.isfile(scrape_config_file):
    print("Scrape config file not found: " + scrape_config_file)
    sys.exit(0)
config.read(scrape_config_file)

# Verify config
# Check that there is a log file to write to
log_path = utils.create_path(os.path.expanduser(config['parser']['log_path']), is_dir=True)

# Create logger to use
logger = setup_custom_logger('root', os.path.join(log_path, "reddit_scraper.log"))

# Check save path
save_path = utils.create_path(os.path.expanduser(config['parser']['save_path']), is_dir=True)

# Just json
is_just_json = False
if config['parser']['just_json'].strip().lower() == 'true':
    is_just_json = True
    # Make sure that we are not saving in a dir where reddit content is saved
    if os.path.isdir(os.path.join(save_path, "user")):
        print("The save directory seems to be where you save reddit content\n"
              "Please pick a location that will be just for json files.")
        sys.exit(0)
    # Create file to say this is a json only directory
    open(os.path.join(save_path, "only_json.lock"), 'a').close()