def enhance(pattern, rules, n):
    size = len(pattern)
    # square_rows becomes a list of lists of grids
    square_rows = []
    # split pattern into rows
    for rows in chunked_iter(pattern, n):
        squares = [[] for _ in range(size // n)]
        for row in rows:
            # split rows into columns, appending each row to the correct square
            # in the list we created above
            for i, c in enumerate(chunked_iter(row, n)):
                squares[i].append(c)
        square_rows.append(squares)
    # now enhance each square that we created above
    for y, squares in enumerate(square_rows):
        for x, square in enumerate(as_tuple_grid(s) for s in squares):
            square_rows[y][x] = rules[square]
    # convert square_rows back into a normal grid
    out = []
    for squares in square_rows:
        rows = ['' for _ in range(len(squares[0]))]
        for square in squares:
            for i, row in enumerate(square):
                rows[i] += ''.join(row)
        out.extend(tuple(r) for r in rows)
    return tuple(out)
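A minimal standalone sketch (not from the original) of how chunked_iter drives the row/column splitting above; the 4x4 pattern is a made-up example and only boltons is assumed.

from boltons.iterutils import chunked_iter

pattern = ('#..#', '....', '....', '#..#')   # hypothetical 4x4 grid
for rows in chunked_iter(pattern, 2):        # pairs of rows
    for row in rows:
        print(list(chunked_iter(row, 2)))    # each row split into 2-wide pieces
# first pair prints [['#', '.'], ['.', '#']] then [['.', '.'], ['.', '.']]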
def tag(
    model: torch.nn.Module, data: ty.Iterable, batch_size: int = 128
) -> ty.List[ty.Tuple[int, ty.List[float]]]:
    """Tag a dataset

    Output: (tag, scores)
    """
    device = next(model.parameters()).device
    model.eval()
    sys_out = []  # type: ty.List[ty.Tuple[int, ty.List[float]]]
    if isinstance(data, ty.Sized):
        data_len = (len(data) - 1) // batch_size + 1  # type: ty.Optional[int]
    else:
        data_len = None
    data = map(datatools.FeaturefulSpan.collate, itu.chunked_iter(data, batch_size))
    pbar = tqdm.tqdm(
        data,
        total=data_len,
        unit="batch",
        desc="Tagging",
        mininterval=2,
        unit_scale=True,
        dynamic_ncols=True,
        disable=None,
        leave=False,
    )
    with torch.no_grad():
        for d in pbar:
            r = model(datatools.move(d, device=device))
            sys_tags = r.argmax(dim=-1).tolist()
            scores = r.exp().tolist()
            sys_out.extend(zip(sys_tags, scores))
    return sys_out
def invoke_semgrep(semgrep_args: List[str], targets: List[str]) -> Dict[str, List[Any]]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments

    Returns json output of semgrep as dict object
    """
    output: Dict[str, List[Any]] = {"results": [], "errors": []}
    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend([
                "-o",
                output_json_file.name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)
            _ = semgrep_exec(*args)
            with open(
                output_json_file.name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                parsed_output = json.load(f)
                output["results"].extend(parsed_output["results"])
                output["errors"].extend(parsed_output["errors"])
    return output
def create_graph(client=None):
    # In a cluster the motif annotations need to be broadcast to all nodes. Otherwise
    # the motif annotations need to be wrapped in a delayed() construct to avoid needless
    # pickling and unpickling between processes.
    delayed_or_future_annotations = client.scatter(motif_annotations, broadcast=True) if client \
        else delayed(motif_annotations, pure=True)

    # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
    # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
    # The original motivation behind the decision to implement this was the refuted assumption that
    # fast-executing tasks would be greatly impacted by scheduler overhead. The chunking of signatures
    # seemed to corroborate this assumption. However, the benefit came from less pickling and unpickling
    # of the motif annotations dataframe, as this was not wrapped in a delayed() construct.

    # Remark on sharing ranking databases across a cluster. Because the frontnodes of the VSC for the LCB
    # share a file server and have a common home folder configured, these databases (stored on this shared
    # drive) can be accessed from all nodes in the cluster and can all use the same path in the
    # configuration file.
    # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
    # database) would be to load the database in memory (using the available decorator) for each task.
    # The penalty of loading the database in memory should be shared across multiple gene signatures, so
    # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
    # However, because the memory need of a node running pyscenic is already high (i.e. pre-allocation
    # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
    # this might not be a sound idea to do.
    return aggregate_func(
        (delayed(transform_func)(db, gs_chunk, delayed_or_future_annotations)
         for db in rnkdbs
         for gs_chunk in chunked_iter(modules, module_chunksize)))
def load_db(self, n=1000):
    """Load database rows.
    """
    rows = tqdm(self.db_rows_iter())
    for chunk in chunked_iter(iter(rows), n):
        session.bulk_save_objects(chunk)
        session.commit()
def _import_events(cls, f: BinaryIO, full_name: str, company_id: str, _):
    _, _, task_id = full_name[0 : -len(cls.events_file_suffix)].rpartition("_")
    print(f"Writing events for task {task_id} into database")
    for events_chunk in chunked_iter(cls.json_lines(f), 1000):
        events = [json.loads(item) for item in events_chunk]
        cls.event_bll.add_events(
            company_id, events=events, worker="", allow_locked_tasks=True
        )
def parse(
    model_path: Union[str, pathlib.Path],
    in_file: Union[str, pathlib.Path, IO[str]],
    out_file: Union[str, pathlib.Path, IO[str]],
    batch_size: Optional[int] = None,
    overrides: Optional[Dict[str, str]] = None,
    raw: bool = False,
    strict: bool = True,
):
    parser = BiAffineParser.load(model_path, overrides)
    if batch_size is None:
        batch_size = parser.default_batch_size
    print("Encoding", file=sys.stderr)
    with smart_open(in_file) as in_stream:
        batches: Union[Iterable[DependencyBatch], Iterable[SentencesBatch]]
        if raw:
            sentences = (
                encoded
                for line in in_stream
                if line and not line.isspace()
                for encoded in [parser.encode_sentence(line.strip().split(), strict=strict)]
                if encoded is not None
            )
            batches = (
                parser.batch_sentences(sentences)
                for sentences in itu.chunked_iter(
                    sentences,
                    size=batch_size,
                )
            )
        else:
            test_set = DependencyDataset(
                DepGraph.read_conll(in_file),
                parser.lexer,
                parser.char_rnn,
                parser.ft_lexer,
                use_labels=parser.labels,
                use_tags=parser.tagset,
            )
            batches = (
                test_set.make_single_batch(sentences)
                for sentences in itu.chunked_iter(
                    test_set.treelist, size=parser.default_batch_size)
            )
        print("Parsing", file=sys.stderr)
        with smart_open(out_file, "w") as ostream:
            parser.batched_predict(
                batches,
                cast(IO[str], ostream),
                greedy=False,
            )
def invoke_semgrep_sarif(
    semgrep_args: List[str],
    targets: List[str],
    *,
    timeout: Optional[int],
    explicit_semgrepignore_path: Optional[str] = None,
) -> Tuple[int, Dict[str, List[Any]]]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments

    Returns sarif output of semgrep as dict object
    """
    output: Dict[str, List[Any]] = {}
    max_exit_code = 0

    _env = (
        {
            "SEMGREP_R2C_INTERNAL_EXPLICIT_SEMGREPIGNORE": explicit_semgrepignore_path,
            **os.environ,
        }
        if explicit_semgrepignore_path
        else os.environ
    )

    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend(["--debug", "--sarif"])
            args.extend([
                "-o",
                output_json_file.name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            exit_code = semgrep_exec(*args, _timeout=timeout, _err=debug_echo, _env=_env).exit_code
            max_exit_code = max(max_exit_code, exit_code)

            with open(
                output_json_file.name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                parsed_output = json.load(f)

            if len(output) == 0:
                output = parsed_output
            else:
                output["runs"][0]["results"].extend(parsed_output["runs"][0]["results"])
                output["runs"][0]["tool"]["driver"]["rules"].extend(
                    parsed_output["runs"][0]["tool"]["driver"]["rules"]
                )

    return max_exit_code, output
def get_by_filename_remote(filenames, chunk_size=200):
    file_infos = []
    warnings = []
    for filenames_chunk in chunked_iter(filenames, chunk_size):
        params = {'names': filenames_chunk}
        url = REMOTE_UTILS_URL + '/file'
        resp, no_infos = get_from_remote(url, params)
        if no_infos:
            # print '!! info missing for %s' % no_infos
            warnings += no_infos
        file_infos += resp
    return file_infos, warnings
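The same chunked-request pattern in a hedged, standalone sketch (not from the original); fetch_infos is a hypothetical stand-in for a remote call such as get_from_remote above, used only to keep each request under a size limit.

from boltons.iterutils import chunked_iter

def fetch_in_chunks(names, fetch_infos, chunk_size=200):
    results = []
    for chunk in chunked_iter(names, chunk_size):   # at most chunk_size names per request
        results.extend(fetch_infos(chunk))
    return results

# e.g. fetch_in_chunks(['a.txt', 'b.txt'], lambda chunk: [{'name': n} for n in chunk])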
def load_db(self, chunk_size=1000):
    """Write db rows.

    Args:
        chunk_size (int): Insert page size.
    """
    rows = self.db_rows()
    chunks = chunked_iter(rows, chunk_size)
    for i, chunk in enumerate(chunks):
        session.bulk_save_objects(chunk)
        session.commit()
        print(dt.now().isoformat(), i)
def wrapper(self, iterable: Iterable, **kwargs):
    assert iterutils.is_collection(
        iterable
    ), "The positional parameter should be an iterable for breaking into chunks"
    func_with_params = functools.partial(func, self, **kwargs)
    with ThreadPoolExecutor() as pool:
        return list(
            itertools.chain.from_iterable(
                filter(
                    None,
                    pool.map(
                        func_with_params,
                        iterutils.chunked_iter(iterable, chunk_size),
                    ),
                )
            ),
        )
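For context, a hedged sketch of the kind of decorator factory such a wrapper typically lives in (the factory name and closure variables are assumptions, not taken from the original): each chunk is mapped onto its own thread and the per-chunk result lists are flattened back into one list.

import functools
import itertools
from concurrent.futures import ThreadPoolExecutor
from boltons import iterutils

def threaded_in_chunks(chunk_size):
    """Hypothetical decorator factory mirroring the wrapper above."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, iterable, **kwargs):
            func_with_params = functools.partial(func, self, **kwargs)
            with ThreadPoolExecutor() as pool:
                # One chunk per thread; drop falsy results and flatten the rest.
                return list(itertools.chain.from_iterable(
                    filter(None, pool.map(func_with_params,
                                          iterutils.chunked_iter(iterable, chunk_size)))))
        return wrapper
    return decorator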
def main(self):
    a = self.args(self.text[2:])
    if a.count > 10 and self.mc2['vips'] == False:
        a.count = 10
    elif a.count > 30:
        a.count = 30
    image_url = []
    try:
        if a.count > 4:
            self.sendmsg("starting to download the pics")
        for _ in range(a.count):
            image_url.append(nekos.img(self.text[1]))
        image = self.multithreadwoload(image_url)
        image_chunks = list(iterutils.chunked_iter(image.split(","), 10))
        for chunk in image_chunks:
            images = ",".join(chunk)
            self.sendmsg("Here you go!", images)
    except:
        self.sendmsg("""Enter one of these arguments, with a number at the end or with -c 5 to get the specified number of pics:
feet, yuri, trap, futanari, hololewd, lewdkemo, solog, feetg, cum, erokemo, les, wallpaper, lewdk, ngif, tickle, lewd, feed, gecg, eroyuri, eron, cum_jpg, bj, nsfw_neko_gif, solo, kemonomimi, nsfw_avatar, gasm, poke, anal, slap, hentai, avatar, erofeet, holo, keta, b*****b, pussy, t**s, holoero, lizard, pussy_jpg, pwankg, classic, kuni, waifu, pat, 8ball, kiss, femdom, neko, spank, cuddle, erok, fox_girl, boobs, random_hentai_gif, smallboobs, hug, ero, smug, goose, baka""")
def chunked(self, size, fill=_MISSING):
    """Return a new :class:`Iter()` spec which groups elements in the
    iterable into lists of length *size*.

    If the optional *fill* argument is provided, iterables not evenly
    divisible by *size* will be padded out by the *fill* constant.
    Otherwise, the final chunk will be shorter than *size*.

    >>> list(glom(range(10), Iter().chunked(3)))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    >>> list(glom(range(10), Iter().chunked(3, fill=None)))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, None, None]]
    """
    kw = {'size': size}
    args = (size,)
    if fill is not _MISSING:
        kw['fill'] = fill
        args += (fill,)
    return self._add_op(
        'chunked', args, lambda it, scope: chunked_iter(it, **kw))
def saveSingleRollout(i):
    subsampled_paths_per_thread = []
    # print("Rollout number is ", i)
    # obs_skip = 15  # np.random.randint(7, 28)
    # num_datapoints = int(2500 / (instancesNum * obs_skip))
    path = perform_rollout(policy, environment, debug=False, animate=opt['animate'],
                           control_step_skip=opt['action_skip'])
    useful_path_data = path['observations'][::]
    # useful_path_data_reverse = useful_path_data[::-1]
    # print(len(useful_path_data))
    # useful_path_data_combined = useful_path_data + useful_path_data_reverse

    # Split a rollout into segments of 50 time steps
    split_paths = list(
        iterutils.chunked_iter(useful_path_data, instancesNum * obs_skip))
    # print(len(split_paths))
    # if (num_datapoints > len(split_paths) - 1):
    num_datapoints = len(split_paths) - 1
    # print("WHYWHY")
    for j in range(num_datapoints):
        observations = split_paths[j]
        obs_sample = []
        # From each 50 steps subsample instancesNum steps and store
        for k in range(instancesNum):
            obs_sample.append(
                observations[int(len(observations) / instancesNum) * k][ignoreObs::])
        subsampled_paths_per_thread.append(obs_sample)
    # paths.extend(split_paths)
    # paths.append(path)
    # print(np.shape(subsampled_paths_per_thread))
    return subsampled_paths_per_thread
def _batch(
    self,
    entries: Any,
    key: str,
    operation: Callable[..., Dict[str, str]],
    raise_on_error: bool = False,
    apply: Callable[..., Any] = lambda x: x,
) -> Dict[str, List[bool]]:
    """[summary]

    Args:
        entries (Any): [description]
        key (str): [description]
        operation (Callable[..., Dict[str, str]]): [description]
        raise_on_error (bool): [description]. Defaults to False.
        apply (Callable[..., Any]): [description]. Defaults to lambda x: x.

    Returns:
        Dict[str, List[bool]]: [description]

    Raises:
        Exception
    """
    res_list = []
    for i_chunk, chunk in enumerate(chunked_iter(entries, 10)):
        payload = [
            {"Id": str(i_chunk * 10 + i), key: apply(m)}
            for i, m in enumerate(chunk)
        ]
        res = operation(QueueUrl=self.queue_url, Entries=payload)
        print(res)
        if raise_on_error and res.get("Failed"):
            raise Exception
        res_list.append(res)
    return reduce(
        lambda c, r: {
            key: c.get(key, []) + r.get(key, [])
            for key in ["Successful", "Failed"]
        },
        res_list,  # type: ignore
    )
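A small standalone check (not from the original) of the Id arithmetic used above: chunking in tens while offsetting by i_chunk * 10 keeps the batch-entry Ids globally unique across chunks.

from boltons.iterutils import chunked_iter

entries = [f"msg-{n}" for n in range(23)]
for i_chunk, chunk in enumerate(chunked_iter(entries, 10)):
    ids = [str(i_chunk * 10 + i) for i, _ in enumerate(chunk)]
    print(ids[0], "..", ids[-1])   # prints 0 .. 9, then 10 .. 19, then 20 .. 22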
def main(self):
    lhelp = []
    mhelp = "\n"
    allowedtype = ["command", "specialcommand"]
    for moduli in mods.modules:
        if moduli.types in allowedtype and moduli.available_for != "admins" and moduli.included:
            lhelp.append(dict(command=moduli.command, doc=moduli.doc))
    lhelp = list(iterutils.chunked_iter(lhelp, 11))
    lhelp = [dict(command="go away from", doc="here, man")] + lhelp
    try:
        number = int(self.text[1])
        lhelp2 = lhelp[number]
    except:
        number = 1
        lhelp2 = lhelp[1]
    for moduli in lhelp2:
        mhelp += f"• {', '.join(moduli['command'])} - {moduli['doc']} \n"
    mhelp += f"Page: {number} \n"
    mhelp += f"Total pages: {len(lhelp) - 1} \n"
    mhelp += "Example of switching to another page: /хелп 3"
    self.sendmsg(mhelp)
def invoke_semgrep(ctx: click.Context) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=ctx.obj.meta.base_commit_ref,
        paths=[workdir],
        ignore_rules_file=get_semgrepignore(ctx.obj.sapp.scan),
    )

    debug_echo("=== seeing if there are any findings")
    findings = FindingSets()

    with targets.current_paths() as paths, get_semgrep_config(ctx) as config_args:
        click.echo("=== looking for current issues in " + unit_len(paths, "file"))
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--json", *config_args]
            for path in chunk:
                args.extend(["--include", path])
            findings.current.update(
                Finding.from_semgrep_result(result, ctx)
                for result in json.loads(str(semgrep(*args)))["results"]
            )
            click.echo(f"| {unit_len(findings.current, 'current issue')} found")

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues"
        )
    else:
        with targets.baseline_paths() as paths, get_semgrep_config(ctx) as config_args:
            if paths:
                paths_with_findings = {finding.path for finding in findings.current}
                paths_to_check = set(str(path) for path in paths) & paths_with_findings
                click.echo("=== looking for pre-existing issues in " + unit_len(paths_to_check, "file"))
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = ["--json", *config_args]
                    for path in chunk:
                        args.extend(["--include", path])
                    findings.baseline.update(
                        Finding.from_semgrep_result(result, ctx)
                        for result in json.loads(str(semgrep(*args)))["results"]
                    )
                click.echo(f"| {unit_len(findings.baseline, 'pre-existing issue')} found")

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report")
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open("w") as sarif_file, get_semgrep_config(ctx) as config_args:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
def _distributed_calc( rnkdbs: Sequence[Type[RankingDatabase]], modules: Sequence[Type[GeneSignature]], motif_annotations_fname: str, transform_func: Callable[ [Type[RankingDatabase], Sequence[Type[GeneSignature]], str], T], aggregate_func: Callable[[Sequence[T]], T], motif_similarity_fdr: float = 0.001, orthologuous_identity_threshold: float = 0.0, client_or_address='custom_multiprocessing', num_workers=None, module_chunksize=100) -> T: """ Perform a parallelized or distributed calculation, either pruning targets or finding enriched motifs. :param rnkdbs: A sequence of ranking databases. :param modules: A sequence of gene signatures. :param motif_annotations_fname: The filename of the motif annotations to use. :param transform_func: A function having a signature (Type[RankingDatabase], Sequence[Type[GeneSignature]], str) and that returns Union[Sequence[Regulon]],pandas.DataFrame]. :param aggregate_func: A function having a signature: - (Sequence[pandas.DataFrame]) => pandas.DataFrame - (Sequence[Sequence[Regulon]]) => Sequence[Regulon] :param motif_similarity_fdr: The maximum False Discovery Rate to find factor annotations for enriched motifs. :param orthologuous_identity_threshold: The minimum orthologuous identity to find factor annotations for enriched motifs. :param client_or_address: The client of IP address of the scheduler when working with dask. For local multi-core systems 'custom_multiprocessing' or 'dask_multiprocessing' can be supplied. :param num_workers: If not using a cluster, the number of workers to use for the calculation. None of all available CPUs need to be used. :param module_chunksize: The size of the chunk in signatures to use when using the dask framework. :return: A pandas dataframe or a sequence of regulons (depends on aggregate function supplied). """ def is_valid(client_or_address): if isinstance(client_or_address, str) and ((client_or_address in { "custom_multiprocessing", "dask_multiprocessing", "local" }) or IP_PATTERN.fullmatch(client_or_address)): return True elif isinstance(client_or_address, Client): return True return False assert is_valid( client_or_address ), "\"{}\"is not valid for parameter client_or_address.".format( client_or_address) # Make sure warnings and info are being logged. if not len(LOGGER.handlers): LOGGER.addHandler(create_logging_handler(False)) if LOGGER.getEffectiveLevel() > logging.INFO: LOGGER.setLevel(logging.INFO) if client_or_address == 'custom_multiprocessing': # CUSTOM parallelized implementation. # This implementation overcomes the I/O-bounded performance. Each worker (subprocess) loads a dedicated ranking # database and motif annotation table into its own memory space before consuming module. The implementation of # each worker uses the AUC-first numba JIT based implementation of the algorithm. assert len(rnkdbs) <= num_workers if num_workers else cpu_count( ), "The number of databases is larger than the number of cores." amplifier = int( (num_workers if num_workers else cpu_count()) / len(rnkdbs)) LOGGER.info("Using {} workers.".format(len(rnkdbs) * amplifier)) receivers = [] for db in rnkdbs: for idx, chunk in enumerate( chunked_iter(modules, ceil(len(modules) / float(amplifier)))): sender, receiver = Pipe() receivers.append(receiver) Worker("{}({})".format(db.name, idx + 1), db, chunk, motif_annotations_fname, sender, motif_similarity_fdr, orthologuous_identity_threshold, transform_func).start() # Retrieve the name of the temporary file to which the data is stored. This is a blocking operation. 
fnames = [recv.recv() for recv in receivers] # Load all data from disk and concatenate. def load(fname): with open(fname, 'rb') as f: return pickle.load(f) try: return aggregate_func(list(map(load, fnames))) finally: # Remove temporary files. for fname in fnames: os.remove(fname) else: # DASK framework. # Load motif annotations. motif_annotations = load_motif_annotations( motif_annotations_fname, motif_similarity_fdr=motif_similarity_fdr, orthologous_identity_threshold=orthologuous_identity_threshold) # Create dask graph. def create_graph(client=None): # In a cluster the motif annotations need to be broadcasted to all nodes. Otherwise # the motif annotations need to wrapped in a delayed() construct to avoid needless pickling and # unpicking between processes. delayed_or_future_annotations = client.scatter(motif_annotations, broadcast=True) if client \ else delayed(motif_annotations, pure=True) # Chunking the gene signatures might not be necessary anymore because the overhead of the dask # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling). # The original behind the decision to implement this was the refuted assumption that fast executing tasks # would greatly be impacted by scheduler overhead. The chunking of signatures seemed to corroborate # this assumption. However, the benefit was through less pickling and unpickling of the motif annotations # dataframe as this was not wrapped in a delayed() construct. # Remark on sharing ranking databases across a cluster. Because the frontnodes of the VSC for the LCB share # a file server and have a common home folder configured, these database (stored on this shared drive) # can be accessed from all nodes in the cluster and can all use the same path in the configuration file. # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking # database) would be to load the database in memory (using the available decorator) for each task. # The penalty of loading the database in memory should be shared across multiple gene signature so # in this case chunking of gene signatures is mandatory to avoid severe performance penalties. # However, because of the memory need of a node running pyscenic is already high (i.e. pre-allocation # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores), # this might not be a sound idea to do. return aggregate_func( (delayed(transform_func)(db, gs_chunk, delayed_or_future_annotations) for db in rnkdbs for gs_chunk in chunked_iter(modules, module_chunksize))) # Compute dask graph ... if client_or_address == "dask_multiprocessing": # ... via multiprocessing. return create_graph().compute( get=get, num_workers=num_workers if num_workers else cpu_count()) else: # ... via dask.distributed framework. client, shutdown_callback = _prepare_client( client_or_address, num_workers=num_workers if num_workers else cpu_count()) try: return client.compute(create_graph(client), sync=True) finally: shutdown_callback(False)
def batches(self, size):
    """Iterate all batches.
    """
    for grafs in chunked_iter(self.grafs, size):
        yield Batch(grafs)
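A minimal self-contained illustration (not from the original) of the same lazy batching pattern; chunked_iter yields chunks on demand, so the underlying sequence is walked exactly once.

from boltons.iterutils import chunked_iter

def batches(items, size):
    """Yield successive lists of at most `size` items, lazily."""
    for chunk in chunked_iter(items, size):
        yield chunk

print([len(b) for b in batches(range(25), 10)])   # [10, 10, 5]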
    help='Upper bound on the number of processes that can be launched in parallel. Default value is '
         'the number of cores on your machine.',
    nargs='?',
    default=mp.cpu_count(),
    type=int)

args = parser.parse_args()

list_bigfiles = [
    bigfile for bigfile in os.listdir(args.directory)
    if args.pattern in bigfile
]

process_dict = {}

if __name__ == '__main__':
    for bigfile in list_bigfiles:
        process_dict[bigfile] = mp.Process(name=bigfile,
                                           target=parallel_upload_gcs,
                                           args=(args.directory + bigfile, args.bucket,))
    # print(args)
    for sublist_bigfiles in iterutils.chunked_iter(list_bigfiles, args.nb_process):
        for bigfile in sublist_bigfiles:
            process_dict[bigfile].start()
        for bigfile in sublist_bigfiles:
            process_dict[bigfile].join()
def main(start_date_, working_dir_, nblocks_, email_notification_, top_): """ The parametrized main function for CLI in the cloud """ # use the following command: # rm -r temp/*; python test.py --top 1000 -s 2018-01-01 #-dir ./temp/ -nblocks 100 --email-notification # on Mac terminal from the dir where you have test.py # comand line arguments; use comments below as an example #TOP = 10000000 # reduce TOP value to 10 for debugging; put it to inf for a full run #DATE = '2017-01-01' # 'from' parameter for historical pricing data #WORKING_DIR = './refinitiv_qa_direct_qai_master_and_pricing_tables/'\ # +str(time.strftime("%Y-%m-%d"))+'/' # dir where all outputs go; it can be dated as above #NBLOCKS = 100 # pricing data are very long queries; they need to be partitioned in blocks # as a separate project, optimize queries # # # pylint: disable=too-many-branches # pylint: disable=too-many-statements # pylint: disable=too-many-locals # top = top_ date_from = start_date_ nblocks = nblocks_ cwd = os.path.realpath(os.path.dirname(__file__)) #os.getcwd() # ./ working_dir = working_dir_ # empty the whole working dir for root, dirs, files in os.walk(working_dir): for f_f in files: os.unlink(os.path.join(root, f_f)) for d_d in dirs: shutil.rmtree(os.path.join(root, d_d)) shutil.copy(cwd+'/master_file_joe.csv', working_dir) # database = 'qai' server = 'cd5m7wkqacpdeus2mia12301.public.dabc3424290b.database.windows.net,3342' username = '******' password = '******' #Authentication: SQL Server Authentication # NOTE: The following works on a Mac with the MSSQL 13 driver installed - it is here as the # default because Art's Anaconda environment doesn't show a non-empty list of drivers from # pyodbc driver = '/usr/local/lib/libmsodbcsql.13.dylib' # '{ODBC Driver 13 for SQL Server}' drivers = [item for item in pyodbc.drivers()] if drivers: driver = drivers[0] #print('driver:{}'.format(driver)) # cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';PORT=1433;DATABASE=' + database + ';UID=' + username + ';PWD=' + password) cursor_ = cnxn.cursor() refinitiv_data_n_columns = 8 s_s = "" if top is not None: s_s = ''' TOP '''+str(top) query = '''SELECT'''+s_s+''' A.SecCode , MR1.ID,MR1.NAME AS CURRNAME , G1.ISSUER AS PITISSUER,G1.EXCHANGE , MR1.Country , G1.StartDate , G1.EndDate , K1.TICKER , G1.EXCHANGE , I.ISSUER AS CURRENTISSUE , I.STATUS , I.SECTYPE AS CURRSECTYPE FROM SecMstrX A JOIN SECMAPX M ON M.SECCODE = A.SECCODE AND M.VenType = 1 -- IDC AND TYPE_ = 1 -- NorthAmer Equity AND M.EXCHANGE <> 2 -- AND M.RANK = 1 -- VIEW ALL (commented out) OR CURRENT ONLY -- AND A.COUNTRY = 'USA' -- comment this out for ADR's JOIN Prc.PrcTKChg K ON M.VENCODE = K.Code JOIN PRC.PRcsCCHG G ON G.CODE = K.CODE AND ISNULL(G.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') --JOIN PRCCODE2 Y --ON Y.TYPE_ = 2 AND ASCII(G.EXCHANGE) = Y.CODE JOIN PRC.PRCINFO I ON I.CODE = G.CODE AND I.SECTYPE NOT IN ('X','P','E','I','S','U','W','0','7','T','Q','R','V') JOIN SECMAPX MP1 ON MP1.VENCODE = I.CODE AND MP1.RANK = M.RANK AND MP1.VENTYPE = 1 AND MP1.EXCHANGE = M.EXCHANGE JOIN SECMSTRX MR1 ON MR1.SECCODE = MP1.SECCODE AND MR1.TYPE_ = 1 JOIN SECMAPX MP2 ON MP2.SECCODE = MR1.SECCODE AND MP2.VENTYPE = 1 AND MP2.RANK = M.RANK JOIN PRC.PRCTKCHG K1 ON K1.CODE = MP2.VENCODE --AND ISNULL(K1.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') JOIN PRC.PRCSCCHG G1 ON G1.CODE = K1.CODE AND ISNULL(G1.ENDDATE,'1/1/2059') BETWEEN K1.STARTDATE AND ISNULL(K1.ENDDATE,'1/1/2059') GROUP BY A.SecCode , MR1.ID , 
MR1.NAME , G1.ISSUER , G1.EXCHANGE , MR1.Country , G1.StartDate , G1.EndDate , K1.TICKER , G1.EXCHANGE , I.ISSUER , I.STATUS , I.SECTYPE ORDER BY MR1.ID , G1.STARTDATE ''' # output the query string to a file with open(working_dir+'query_master_table.txt', "w") as query_file: query_file.write(query) print('\n\nexecuting the query ... ', datetime.now()) try: print('\n\ntrying to execute cursor_.execute(query) ...', datetime.now()) cursor_.execute(query) except Exception as err: print('\n\nexception #1 for cursor_.execute(query)', err, datetime.now()) print('\n\nfetching query result ... ', datetime.now()) try: print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now()) result = cursor_.fetchall() except Exception as err: print('\n\nexception #2 for result = cursor_.fetchall()', err, datetime.now()) tickers = [] print('\n\nwriting .csv file (master table) ... ', datetime.now()) with tqdm(total=len(result), file=sys.stdout) as pbar: TABLE_MASTER = [] TABLE_MERGED = [] for row in result: pbar.set_description('progress at %s' % datetime.now()) pbar.update(1) row1 = [] row3 = [] date_to = datetime.date(datetime.now()) if row[7] is not None: # to date_to = datetime.date(row[7]) else: date_to = datetime.date(datetime.now()) if date_to > datetime.date(datetime.now()): date_to = datetime.date(datetime.now()) # row1.append(str(row[8])) # ticker tickers.append(row[8]) row1.append(str(row[3])) # point-in-time name row1.append(str(date_to)) # to # row1.append(str(row[0])) # SecCode row3.append(int(row[0])) # int for sorting row1.append(datetime.date(row[6])) # from row3.append(datetime.date(row[6])) row1.append(date_to) # to row3.append(date_to) row1.append(str(row[3])) # point-in-time name row3.append(str(row[3])) row1.append(str(row[8])) # ticker row3.append(str(row[8])) row1.append(str(row[5])) # country row3.append(str(row[5])) row1.append(str(row[2])) # current name row3.append(str(row[2])) row1.append(str(row[12])) # type row3.append(str(row[12])) if row1 not in TABLE_MERGED: TABLE_MERGED.append(row1) if row3 not in TABLE_MERGED: TABLE_MASTER.append(row3) with open(working_dir+'master_table.csv', 'w') as result_file: TABLE_MASTER1 = [] TABLE_MASTER1.append(create_titles([ 'SecCode' , 'From' , 'To' , 'Point-in-time name' , 'Ticker' , 'Country' , 'Current name' , 'Type' ])) TABLE_MASTER = sorted(TABLE_MASTER, key=lambda item: item[0]) # sorted(TABLE_MASTER, key=operator.itemgetter(0)) TABLE_MASTER1 += TABLE_MASTER WR = csv.writer(result_file, dialect='excel') print("HERE") WR.writerows(TABLE_MASTER1) print('\n\npost-processing 1 ... ', datetime.now()) with open(working_dir+'master_file_joe.csv', 'r') as csv_file: CSV_READER = csv.reader(csv_file, delimiter=',') NROW = 0 for row in CSV_READER: row1 = [] # change True to False to use the list if (str(row[3]) in ('C', 'BAC', 'AAPL') or True) and NROW != 0: # skip titles row1.append(str(row[3])) row1.append(str(row[4])) row1.append(str(row[2])) for _ in range(refinitiv_data_n_columns): row1.append('') # fill in with blanks for merged .csv for r in row: row1.append(r) TABLE_MERGED.append(row1) NROW += 1 print('\n\npost-processing 2 ... 
', datetime.now()) with open(working_dir+'master_table_merged_art_vs_joe.csv', 'w') as result_file: WR = csv.writer(result_file, dialect='excel') TABLE_MERGED1 = sorted(TABLE_MERGED, key=operator.itemgetter(0, 1, 2)) TABLE_MERGED2 = [] TABLE_MERGED2.append(create_titles([ '' , '' , '' , 'SecCode' , 'From' , 'To' , 'Point-in-time name' , 'Ticker' , 'Country' , 'Current name' , 'Type' , 'ID' , 'FROM' , 'TO' , 'TICKER' , 'NAME' , 'TYPE' ])) TABLE_MERGED2 += TABLE_MERGED1 WR.writerows(TABLE_MERGED2) print('\n\npost-processing 3 ... ', datetime.now()) TICKERS_JOE = [] # this should be an array of unique tickers i = 0 with open(working_dir+'master_file_joe.csv', 'r') as csv_file: CSV_READER = csv.reader(csv_file, delimiter=',') for row in CSV_READER: if i != 0: # skip titles at i = 0 if row[3] not in TICKERS_JOE: # unique tickers TICKERS_JOE.append(row[3]) i += 1 TICKERS_ART = [] # this should be an array of unique tickers for t1 in tickers: if t1 not in TICKERS_ART: TICKERS_ART.append(t1) print('\n\nnumber of unique tickers in the master: ', len(TICKERS_ART), datetime.now()) if top is None: print('\n\npost-processing 4 ... ', datetime.now()) MISSING_TICKERS = [] for tj in TICKERS_JOE: if tj not in TICKERS_ART: # unique tickers MISSING_TICKERS.append(tj) MISSING_TICKERS1 = [] for mt in MISSING_TICKERS: if mt not in MISSING_TICKERS1: # unique tickers MISSING_TICKERS1.append(mt) print('\n\nnumber of missing tickers: ', len(MISSING_TICKERS1), datetime.now()) TICKERS_WITHOUT_SUFFIX = [] for mt in MISSING_TICKERS1: if mt.find('.') != -1: mt = mt.split('.')[0] else: mt = mt[:-1] # try to remove the fused suffix for missing tickers if mt not in TICKERS_WITHOUT_SUFFIX: TICKERS_WITHOUT_SUFFIX.append(mt) print('\n\nnumber of missing tickers without suffix: ', len(TICKERS_WITHOUT_SUFFIX), datetime.now()) query = '''SELECT * FROM PRC.PRCSCCHG WHERE TICKER IN (\'''' for tws in TICKERS_WITHOUT_SUFFIX: query += str(tws)+'''\', \'''' query = query[:-3] query += ''')''' try: print('\n\ntrying to execute cursor_.execute(query)...', datetime.now()) cursor_.execute(query) except Exception as err: print('\n\nexception #3 for cursor_.execute(query)', err, datetime.now()) print('\n\nfetching second query result ... ', datetime.now()) try: print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now()) result = cursor_.fetchall() except Exception as err: print('\n\nexception #4 for result = cursor_.fetchall()', err, datetime.now()) with open(working_dir+'addendum_master_table.csv', 'w') as result_file: TABLE_ADDENDUM = result TABLE_ADDENDUM = sorted(TABLE_ADDENDUM, key=operator.itemgetter(4)) TABLE_ADDENDUM1 = [] TABLE_ADDENDUM1.append(create_titles([ 'SecCode' , 'From' , 'To' , 'CUSIP' , 'Ticker' , 'SEDOL' , 'Issuer' , 'Full ticker' , 'Base ticker' , 'Group' , 'Series' , 'Exchange' ])) TABLE_ADDENDUM1 += TABLE_ADDENDUM WR = csv.writer(result_file, dialect='excel') WR.writerows(TABLE_ADDENDUM1) FOUND_TICKERS = [] for row in result: if str(row[4]) not in FOUND_TICKERS: FOUND_TICKERS.append(str(row[4])) print('\n\nnumber of found tickers: ', len(FOUND_TICKERS), datetime.now()) MISSING_TICKERS2 = [] for mt in MISSING_TICKERS1: wosuffix = mt if wosuffix.find('.') != -1: wosuffix = wosuffix.split('.')[0] else: wosuffix = wosuffix[:-1] # try to remove the fused suffix if wosuffix not in FOUND_TICKERS and mt not in FOUND_TICKERS: # tickers w/o and with suffix MISSING_TICKERS2.append(mt) print('\n\nfinal number of missing tickers: ', len(MISSING_TICKERS2), datetime.now()) print('\n\nwriting missing tickers ... 
', datetime.now()) with open(working_dir+'missing_tickers.csv', 'w') as result_file: WR = csv.writer(result_file, dialect='excel') MISSING_TICKERS2.sort() MISSING_TICKERS3 = [] for row in MISSING_TICKERS2: with open(working_dir+'master_file_joe.csv', 'r') as csv_file: CSV_READER = csv.reader(csv_file, delimiter=',') i = 0 for row2 in CSV_READER: if row2[3] == row and i != 0: # skip titles at i = 0 row5 = [] row5.append(str(row2[3])) row5.append(str(row2[4])) if row5 not in MISSING_TICKERS3: # unique entries MISSING_TICKERS3.append(row5) i += 1 MISSING_TICKERS4 = [] MISSING_TICKERS4.append(create_titles(['Tickers', 'Co. names'])) MISSING_TICKERS4 += MISSING_TICKERS3 WR.writerows(MISSING_TICKERS4) # build objects for missing ticker qqq #i = 0 #for t in MISSING_TICKERS3: # print(t) # T = TickerNeighborhood(ticker=t[0]) # T.current_name = t[1] # print(T) # print(T.ticker) # print(T.name) # list_of_suggested_tickers_for_addendum=[] # list_of_suggested_tickers_for_addendum #=T.analyze_the_neighborhood_of_T_while_keeping_in_mind_joes_master_table #('master_table_joe.csv') print('\n\ndownloading pricing data ... ', datetime.now()) SECCODES = [] with open(working_dir+'master_table.csv') as csv_file: CSV_READER = csv.reader(csv_file, delimiter=',') L = 0 for row in CSV_READER: if row[0] not in SECCODES and L > 0: # skip titles, unique seccodes SECCODES.append(row[0]) L += 1 print('\n\ndistinct seccodes = ', len(SECCODES), datetime.now()) print('\n\nprocessing ...', datetime.now()) query = ''' --This query returns the fully adjusted Open, High, Low, and Close Pricing data in Local Currency using the Ds2Primqtprc table for North American Equities*/ SELECT DISTINCT A.SecCode , MR1.ID,MR1.NAME AS CURRNAME , G1.ISSUER AS PITISSUER,G1.EXCHANGE , MR1.Country , G1.StartDate , G1.EndDate , K1.TICKER , G1.EXCHANGE , I.ISSUER AS CURRENTISSUE , I.STATUS , I.SECTYPE AS CURRSECTYPE , C1.TotRet , C1.* FROM SecMstrX A JOIN SECMAPX M ON M.SECCODE = A.SECCODE AND M.VenType = 1 -- IDC AND TYPE_ = 1 -- NorthAmer Equity AND M.EXCHANGE <> 2 -- AND M.EXCHANGE = 1 AND A.TYPE_ = 1 -- AND M.RANK = 1 -- VIEW ALL OR CURRENT ONLY -- AND A.COUNTRY = 'USA' -- comment this out for ADR's JOIN Prc.PrcTKChg K ON M.VENCODE = K.Code JOIN PRC.PRcsCCHG G ON G.CODE = K.CODE AND ISNULL(G.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') JOIN PRC.PRCINFO I ON I.CODE = G.CODE AND I.SECTYPE NOT IN ('X','P','E','I','S','U','W','0','7','T','Q','R','V') JOIN SECMAPX MP1 ON MP1.VENCODE = I.CODE AND MP1.RANK = M.RANK AND MP1.VENTYPE = 1 AND MP1.EXCHANGE = M.EXCHANGE JOIN SECMSTRX MR1 ON MR1.SECCODE = MP1.SECCODE AND MR1.TYPE_ = 1 JOIN SECMAPX MP2 ON MP2.SECCODE = MR1.SECCODE AND MP2.VENTYPE = 1 AND MP2.RANK = M.RANK JOIN PRC.PRCTKCHG K1 ON K1.CODE = MP2.VENCODE --AND ISNULL(K1.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') JOIN PRC.PRCSCCHG G1 ON G1.CODE = K1.CODE AND ISNULL(G1.ENDDATE,'1/1/2059') BETWEEN K1.STARTDATE AND ISNULL(K1.ENDDATE,'1/1/2059') JOIN PRC.PRCDLY C1 ON C1.CODE = G1.CODE WHERE A.SECCODE IN (''' # BLOCK_SIZE = int(len(SECCODES)/nblocks)+1 with tqdm(total=nblocks, file=sys.stdout) as pbar: TABLE = [] LIST = [[] for n in range(20750101)] for seccodeblock in list(iterutils.chunked_iter(SECCODES, BLOCK_SIZE)): pbar.set_description('progress at %s' % time.strftime("%c")) pbar.update(1) query_SECCODES = '' print('\n\nseccodeblock = ', len(seccodeblock), datetime.now()) for sc in seccodeblock: query_SECCODES += str(sc) + ''',''' query_SECCODES = query_SECCODES[:-1] query_DATE = 
'''CAST(C1.Date_ AS DATETIME)>= \'''' + date_from + '''\'''' COMPOSED_query = query +\ query_SECCODES + ''')\n\nAND\n\n''' +\ query_DATE + '''\n\nORDER BY C1.Date_''' with open(working_dir+'query_pricing_data.txt', 'w') as query_file: query_file.write(COMPOSED_query) keep_trying_to_query = True result = None # the query might fail because the computer got moved to a different location, # which resulted in IP change; in this case, try to re-open the connection, then re-do the query while keep_trying_to_query: try: print('\n\ntrying to execute cursor_.execute(COMPOSED_query)...', datetime.now()) cursor_.execute(COMPOSED_query) try: print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now()) result = cursor_.fetchall() keep_trying_to_query = False except Exception as err: try: print('\n\nexception #5 for cursor_.execute(COMPOSED_query)', err, datetime.now()) print('\n\nexception #6 for result = cursor_.fetchall()', err, datetime.now()) cursor_.close() cnxn.close() print("\n\nre-opening server connection...", datetime.now()) cnxn = pyodbc.connect('DRIVER='+driver+ ';SERVER='+server+ ';PORT=1433;DATABASE='+database+ ';UID='+username+ ';PWD='+password) cursor_ = cnxn.cursor() except Exception as err: print('\n\nexception #7 for reconnect', err, datetime.now()) except Exception as err: try: print('\n\nexception #8 for cursor_.execute(COMPOSED_query)', err, datetime.now()) print('\n\nexception #9 for result = cursor_.fetchall()', err, datetime.now()) cursor_.close() cnxn.close() print("\n\nre-opening server connection...", datetime.now()) cnxn = pyodbc.connect('DRIVER='+driver+ ';SERVER='+server+ ';PORT=1433;DATABASE='+database+ ';UID='+username+ ';PWD='+password) cursor_ = cnxn.cursor() except Exception as err: print('\n\nexception #10 for reconnect', err, datetime.now()) # if result is not None: print("\n\nquery produced %d rows" % len(result), datetime.now()) for row in result: row3 = [] row3.append(int(row[0])) # SecCode row3.append(row[8]) # ticker if row[15] is not None: date1 = str(row[15])[:-9] # market date row3.append(date1) else: row3.append('-1.0') if row[16] is not None: row3.append(row[16]) # open else: row3.append('-1.0') if row[17] is not None: row3.append(row[17]) # high else: row3.append('-1.0') if row[18] is not None: row3.append(row[18]) # low else: row3.append('-1.0') if row[19] is not None: row3.append(row[19]) # unadjusted close else: row3.append('-1.0') if row[20] is not None: row3.append(row[20]) # volume else: row3.append('-1.0') if row[21] is not None: row3.append(row[21]) # TotRet else: row3.append('-1.0') if row3 not in TABLE: TABLE.append(row3) idx = int(row[15].strftime('%Y%m%d')) LIST[idx].append(row3) # for i, it in enumerate(LIST): if it: s = str(i) year = s[:-4] month = s[4:-2] day = s[6:] date2 = year+'-'+month+'-'+day table1 = [] table2 = [] table2.append(create_titles([ 'SecCode' , 'Ticker' , 'Date' , 'Open' , 'High' , 'Low' , 'Close, unadjusted' , 'Volume' , 'Total return' ])) for _, item in enumerate(it): if item not in table1: table1.append(item) table1 = sorted(table1, key=operator.itemgetter(0, 1)) table2 += table1 ofp = Path(dir_from_date(date2, 'y', working_dir)+'pricing_data_for_'+date2+'.csv') with open(ofp, 'a') as result_file: wr = csv.writer(result_file, dialect='excel') wr.writerows(table2) # # NOW = str(date.today()) print('\n\ncompressing output and timestamping ... 
', datetime.now()) FILE_NAME = 'refinitiv_qa_direct_qai_master_and_pricing_tables_'+NOW print(FILE_NAME, datetime.now()) shutil.make_archive(FILE_NAME, 'zip', working_dir) print('\n\nmoving the data to the timestamped repository ... ', datetime.now()) SRC = cwd data_repo = os.path.join(SRC, 'RefinitivDataRepository') if not os.path.exists(data_repo): os.mkdir(data_repo) if not os.path.isdir(data_repo): raise Exception(f'Data repository is not a directory: {data_repo}') OUTPUT_FILE_STAGING_PATH = os.path.join(SRC, FILE_NAME+'.zip') OUTPUT_FILE_PATH = Path(os.path.join(data_repo, FILE_NAME+'.zip')) print('OUTPUT_FILE_STAGING_PATH = ', OUTPUT_FILE_STAGING_PATH, 'OUTPUT_FILE_PATH', OUTPUT_FILE_PATH) if os.path.isfile(OUTPUT_FILE_STAGING_PATH): if os.path.isfile(OUTPUT_FILE_PATH): new_file_size = os.stat(OUTPUT_FILE_STAGING_PATH).st_size old_file_size = os.stat(OUTPUT_FILE_PATH).st_size print('\n\nnew zip size = ', new_file_size, '\told_file_size = ', old_file_size) if new_file_size > old_file_size: os.remove(OUTPUT_FILE_PATH) shutil.move(OUTPUT_FILE_STAGING_PATH, OUTPUT_FILE_PATH) else: shutil.move(OUTPUT_FILE_STAGING_PATH, OUTPUT_FILE_PATH) if email_notification_: print('\n\nemailing the confirmation and the link to compressed data to the author ... ', datetime.now()) ALERT = '''This is to notify that new compressed data set was uploaded to FORA google drive ...''' EMAIL = 'Alert time: ' + time.strftime("%c") +'\n' + ALERT CLIENT_EMAIL = ['*****@*****.**', '*****@*****.**'] # #{'*****@*****.**', '*****@*****.**', '*****@*****.**'} # MESSAGE = create_message('*****@*****.**',\ # CLIENT_EMAIL, 'Completion alert', EMAIL) yagmail.SMTP('*****@*****.**').send(CLIENT_EMAIL, 'Completion alert', EMAIL) print('\n\nemailed to the user:\n'+ALERT, datetime.now()) print('\n\nexiting ... ', datetime.now())
def create_graph(client=None):
    # NOTE ON CHUNKING SIGNATURES:
    # Chunking the gene signatures might not be necessary anymore because the overhead of the dask
    # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling).
    # The original motivation behind the decision to implement this was the refuted assumption that
    # fast-executing tasks would be greatly impacted by scheduler overhead. The performance gain introduced
    # by chunking of signatures seemed to corroborate this assumption. However, the benefit came from less
    # pickling and unpickling of the motif annotations dataframe, as this was not wrapped in a delayed()
    # construct.
    # When using a distributed scheduler, chunking even has a negative impact and is therefore overruled. The
    # negative impact is due to these large chunks having to be shipped to different workers across cluster nodes.

    # NOTE ON BROADCASTING DATASET:
    # There are three large pieces of data that need to be orchestrated between client/scheduler and workers:
    # 1. The motif annotations: in a cluster these need to be broadcast to all nodes. Otherwise they need
    # to be wrapped in a delayed() construct to avoid needless pickling and unpickling between processes.
    def wrap(data):
        return client.scatter(data, broadcast=True) if client else delayed(data, pure=True)
    delayed_or_future_annotations = wrap(motif_annotations)
    # 2. The databases: these database objects are typically proxies to the data on disk. They only have
    # the name and location on shared storage as fields. For consistency reasons we do broadcast these
    # database objects to the workers. If we decide to have all information of a database loaded into
    # memory we can still safely use clusters.
    #def memoize(db: Type[RankingDatabase]) -> Type[RankingDatabase]:
    #    return MemoryDecorator(db)
    #delayed_or_future_dbs = list(map(wrap, map(memoize, rnkdbs)))
    # Check also this Stack Overflow question: https://stackoverflow.com/questions/50795901/dask-scatter-broadcast-a-list
    delayed_or_future_dbs = list(map(wrap, rnkdbs))
    # 3. The gene signatures: these signatures become large when chunking them, therefore chunking is
    # overruled when using dask.distributed. See earlier.

    # NOTE ON SHARING RANKING DATABASES ACROSS NODES:
    # Because the frontnodes of the VSC share the staging disk, these databases can be accessed from all
    # nodes in the cluster and can all use the same path in the configuration file. The RankingDatabase
    # objects shared from scheduler to workers can therefore contain just the information on database file
    # location.
    # There might be a need to be able to run on clusters that do not share a network drive. This can be
    # achieved by loading all data in on the scheduler and using the broadcasting system to share data
    # across nodes. The only element that needs to be adapted to cater for this need is loading the databases
    # in memory on the scheduler via the already available MemoryDecorator for databases. But make sure to
    # adapt memory limits for workers to avoid "distributed.nanny - WARNING - Worker exceeded 95% memory budget.".

    # NOTE ON REMOVING I/O CONTENTION:
    # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking
    # database) would be to load the database in memory (using the available decorator) for each task.
    # The penalty of loading the database in memory should be shared across multiple gene signatures, so
    # in this case chunking of gene signatures is mandatory to avoid severe performance penalties.
    # However, because the memory need of a node running pyscenic is already high (i.e. pre-allocation
    # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores),
    # this might not be a sound idea to do.
    # Another approach to overcome the I/O bottleneck in a clustered infrastructure is to assign each cluster
    # to a different database, which is achievable in the dask framework. This approach of course has many
    # limitations: for 6 databases you need at least 6 cores and you cannot take advantage of more
    # (http://distributed.readthedocs.io/en/latest/locality.html)

    # NOTE ON REMAINING WARNINGS:
    # >> distributed.worker - WARNING - Memory use is high but worker has no data to store to disk.
    # >> Perhaps some other process is leaking memory? Process memory: 1.51 GB -- Worker memory limit: 2.15 GB
    # My current idea is that this cannot be avoided: processing a single module can sometimes require a
    # substantial amount of memory because of pre-allocation of recovery curves (see code notes on how to
    # mitigate this problem). Setting module_chunksize=1 also limits this problem.
    #
    # >> distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%)
    # The current implementation of module2df releases substantial amounts of memory (i.e. the RCCs) so this
    # might again be unavoidable. TBI + see the following Stack Overflow question:
    # https://stackoverflow.com/questions/47776936/why-is-a-computation-much-slower-within-a-dask-distributed-worker
    return aggregate_func(
        (delayed(transform_func)(db, gs_chunk, delayed_or_future_annotations)
         for db in delayed_or_future_dbs
         for gs_chunk in chunked_iter(modules, module_chunksize)))
def _distributed_calc(rnkdbs: Sequence[Type[RankingDatabase]], modules: Sequence[Type[GeneSignature]], motif_annotations_fname: str, transform_func: Callable[[Type[RankingDatabase], Sequence[Type[GeneSignature]], str], T], aggregate_func: Callable[[Sequence[T]], T], motif_similarity_fdr: float = 0.001, orthologuous_identity_threshold: float = 0.0, client_or_address='dask_multiprocessing', num_workers=None, module_chunksize=100) -> T: """ Perform a parallelized or distributed calculation, either pruning targets or finding enriched motifs. :param rnkdbs: A sequence of ranking databases. :param modules: A sequence of gene signatures. :param motif_annotations_fname: The filename of the motif annotations to use. :param transform_func: A function having a signature (Type[RankingDatabase], Sequence[Type[GeneSignature]], str) and that returns Union[Sequence[Regulon]],pandas.DataFrame]. :param aggregate_func: A function having a signature: - (Sequence[pandas.DataFrame]) => pandas.DataFrame - (Sequence[Sequence[Regulon]]) => Sequence[Regulon] :param motif_similarity_fdr: The maximum False Discovery Rate to find factor annotations for enriched motifs. :param orthologuous_identity_threshold: The minimum orthologuous identity to find factor annotations for enriched motifs. :param client_or_address: The client of IP address of the scheduler when working with dask. For local multi-core systems 'custom_multiprocessing' or 'dask_multiprocessing' can be supplied. :param num_workers: If not using a cluster, the number of workers to use for the calculation. None of all available CPUs need to be used. :param module_chunksize: The size of the chunk in signatures to use when using the dask framework with the multiprocessing scheduler. :return: A pandas dataframe or a sequence of regulons (depends on aggregate function supplied). """ def is_valid(client_or_address): if isinstance(client_or_address, str) and ((client_or_address in {"custom_multiprocessing", "dask_multiprocessing", "local"}) or IP_PATTERN.fullmatch(client_or_address)): return True elif isinstance(client_or_address, Client): return True return False assert is_valid(client_or_address), "\"{}\"is not valid for parameter client_or_address.".format(client_or_address) if client_or_address not in {'custom_multiprocessing', 'dask_multiprocessing'}: module_chunksize = 1 # Make sure warnings and info are being logged. if not len(LOGGER.handlers): LOGGER.addHandler(create_logging_handler(False)) if LOGGER.getEffectiveLevel() > logging.INFO: LOGGER.setLevel(logging.INFO) if client_or_address == 'custom_multiprocessing': # CUSTOM parallelized implementation. # This implementation overcomes the I/O-bounded performance. Each worker (subprocess) loads a dedicated ranking # database and motif annotation table into its own memory space before consuming module. The implementation of # each worker uses the AUC-first numba JIT based implementation of the algorithm. assert len(rnkdbs) <= num_workers if num_workers else cpu_count(), "The number of databases is larger than the number of cores." 
amplifier = int((num_workers if num_workers else cpu_count())/len(rnkdbs)) LOGGER.info("Using {} workers.".format(len(rnkdbs) * amplifier)) receivers = [] for db in rnkdbs: for idx, chunk in enumerate(chunked_iter(modules, ceil(len(modules)/float(amplifier)))): sender, receiver = Pipe() receivers.append(receiver) Worker("{}({})".format(db.name, idx+1), db, chunk, motif_annotations_fname, sender, motif_similarity_fdr, orthologuous_identity_threshold, transform_func).start() # Retrieve the name of the temporary file to which the data is stored. This is a blocking operation. fnames = [recv.recv() for recv in receivers] # Load all data from disk and concatenate. def load(fname): with open(fname, 'rb') as f: return pickle.load(f) try: return aggregate_func(list(map(load, fnames))) finally: # Remove temporary files. for fname in fnames: os.remove(fname) else: # DASK framework. # Load motif annotations. motif_annotations = load_motif_annotations(motif_annotations_fname, motif_similarity_fdr=motif_similarity_fdr, orthologous_identity_threshold=orthologuous_identity_threshold) # Create dask graph. def create_graph(client=None): # NOTE ON CHUNKING SIGNATURES: # Chunking the gene signatures might not be necessary anymore because the overhead of the dask # scheduler is minimal (cf. blog http://matthewrocklin.com/blog/work/2016/05/05/performant-task-scheduling). # The original behind the decision to implement this was the refuted assumption that fast executing tasks # would greatly be impacted by scheduler overhead. The performance gain introduced by chunking of signatures # seemed to corroborate this assumption. However, the benefit was through less pickling and unpickling of # the motif annotations dataframe as this was not wrapped in a delayed() construct. # When using a distributed scheduler chunking even has a negative impact and is therefore overruled. The # negative impact is due to having these large chunks to be shipped to different workers across cluster nodes. # NOTE ON BROADCASTING DATASET: # There are three large pieces of data that need to be orchestrated between client/scheduler and workers: # 1. In a cluster the motif annotations need to be broadcasted to all nodes. Otherwise # the motif annotations need to wrapped in a delayed() construct to avoid needless pickling and # unpicking between processes. def wrap(data): return client.scatter(data, broadcast=True) if client else delayed(data, pure=True) delayed_or_future_annotations = wrap(motif_annotations) # 2. The databases: these database objects are typically proxies to the data on disk. They only have # the name and location on shared storage as fields. For consistency reason we do broadcast these database # objects to the workers. If we decide to have all information of a database loaded into memory we can still # safely use clusters. #def memoize(db: Type[RankingDatabase]) -> Type[RankingDatabase]: # return MemoryDecorator(db) #delayed_or_future_dbs = list(map(wrap, map(memoize, rnkdbs))) # Check also latest Stackoverflow message: https://stackoverflow.com/questions/50795901/dask-scatter-broadcast-a-list delayed_or_future_dbs = list(map(wrap, rnkdbs)) # 3. The gene signatures: these signatures become large when chunking them, therefore chunking is overruled # when using dask.distributed. # See earlier. # NOTE ON SHARING RANKING DATABASES ACROSS NODES: # Because the frontnodes of the VSC share the staging disk, these databases can be accessed from all nodes # in the cluster and can all use the same path in the configuration file. 
The RankingDatabase objects shared # from scheduler to workers can therefore be just contain information on database file location. # There might be a need to be able to run on clusters that do not share a network drive. This can be # achieved via by loading all data in from the scheduler and use the broadcasting system to share data # across nodes. The only element that needs to be adapted to cater for this need is loading the databases # in memory on the scheduler via the already available MemoryDecorator for databases. But make sure the # adapt memory limits for workers to avoid "distributed.nanny - WARNING - Worker exceeded 95% memory budget.". # NOTE ON REMOVING I/O CONTENTION: # A potential improvement to reduce I/O contention for this shared drive (accessing the ranking # database) would be to load the database in memory (using the available decorator) for each task. # The penalty of loading the database in memory should be shared across multiple gene signature so # in this case chunking of gene signatures is mandatory to avoid severe performance penalties. # However, because of the memory need of a node running pyscenic is already high (i.e. pre-allocation # of recovery curves - 20K features (max. enriched) * rank_threshold * 8 bytes (float) * num_cores), # this might not be a sound idea to do. # Another approach to overcome the I/O bottleneck in a clustered infrastructure is to assign each cluster # to a different database which is achievable in the dask framework. This approach has of course many # limitations: for 6 database you need at least 6 cores and you cannot take advantage of more # (http://distributed.readthedocs.io/en/latest/locality.html) # NOTE ON REMAINING WARNINGS: # >> distributed.worker - WARNING - Memory use is high but worker has no data to store to disk. # >> Perhaps some other process is leaking memory? Process memory: 1.51 GB -- Worker memory limit: 2.15 GB # My current idea is that this cannot be avoided processing a single module can sometimes required # substantial amount of memory because of pre-allocation of recovery curves (see code notes on how to # mitigate this problem). Setting module_chunksize=1 also limits this problem. # # >> distributed.utils_perf - WARNING - full garbage collections took 10% CPU time recently (threshold: 10%) # The current implementation of module2df removes substantial amounts of memory (i.e. the RCCs) so this might # again be unavoidable. TBI + See following stackoverflow question: # https://stackoverflow.com/questions/47776936/why-is-a-computation-much-slower-within-a-dask-distributed-worker return aggregate_func( (delayed(transform_func) (db, gs_chunk, delayed_or_future_annotations) for db in delayed_or_future_dbs for gs_chunk in chunked_iter(modules, module_chunksize))) # Compute dask graph ... if client_or_address == "dask_multiprocessing": # ... via multiprocessing. return create_graph().compute(scheduler='processes', num_workers=num_workers if num_workers else cpu_count()) else: # ... via dask.distributed framework. client, shutdown_callback = _prepare_client(client_or_address, num_workers=num_workers if num_workers else cpu_count()) try: return client.compute(create_graph(client), sync=True) finally: shutdown_callback(False)
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]

    debug_echo("=== seeing if there are any findings")
    finding_set = FindingSets()

    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " + unit_len(paths, "file"), err=True)
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = ["--skip-unknown-extensions", "--json", *config_args]
            for path in chunk:
                args.append(path)
            count = 0
            for result in json.loads(str(semgrep(*args)))["results"]:
                finding_set.update_current(result, committed_datetime)
                count += 1
            click.echo(f"| {count} {cardinalize('current issue', count)} found", err=True)

    if not finding_set.has_current_issues():
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            if paths:
                paths_with_findings = finding_set.paths_with_current_findings()
                paths_to_check = set(str(path) for path in paths) & paths_with_findings
                click.echo(
                    "=== looking for pre-existing issues in " + unit_len(paths_to_check, "file"),
                    err=True,
                )
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = ["--skip-unknown-extensions", "--json", *config_args]
                    for path in chunk:
                        args.append(path)
                    count = 0
                    for result in json.loads(str(semgrep(*args)))["results"]:
                        finding_set.update_baseline(result, committed_datetime)
                        count += 1
                    click.echo(f"| {count} {cardinalize('pre-existing issue', count)} found", err=True)

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open("w") as sarif_file:
            args = ["--sarif", *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return finding_set
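PATHS_CHUNK_SIZE exists because the scanned paths are passed as positional command-line arguments, and a very long file list can exceed the operating system's argument-length limit (the FIXME above is the one place that still skips chunking). A hedged sketch of that idea with a generic command follows; the echo call, the chunk size, and run_in_chunks() are placeholders rather than the action's real constants or helpers.

import subprocess
from boltons.iterutils import chunked_iter

PATHS_CHUNK_SIZE = 500  # illustrative value only

def run_in_chunks(base_args, paths, chunk_size=PATHS_CHUNK_SIZE):
    """Run one subprocess per chunk of paths so argv never grows unbounded."""
    outputs = []
    for chunk in chunked_iter(paths, chunk_size):
        # Each invocation sees at most chunk_size paths.
        completed = subprocess.run(
            ["echo", *base_args, *chunk],  # stand-in for the real scanner binary
            capture_output=True,
            text=True,
            check=True,
        )
        outputs.append(completed.stdout)
    return outputs

if __name__ == "__main__":
    fake_paths = [f"src/file_{i}.py" for i in range(1200)]
    results = run_in_chunks(["--json"], fake_paths)
    print(f"ran {len(results)} chunked invocations")  # 3 for 1200 paths at 500 per chunk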
def batches(self, size):
    """Iterate all batches."""
    for abstracts in chunked_iter(self.abstracts, size):
        yield Batch(abstracts)
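For context, chunked_iter yields successive groups of up to size items, so the final batch may be smaller. A quick usage sketch, with Batch reduced to a trivial stand-in for the class used above:

from boltons.iterutils import chunked_iter

class Batch:
    def __init__(self, abstracts):
        self.abstracts = abstracts

abstracts = [f"abstract {i}" for i in range(7)]
for batch in (Batch(group) for group in chunked_iter(abstracts, 3)):
    print(len(batch.abstracts))  # prints 3, 3, 1 -- the last batch is short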
def invoke_semgrep(
    semgrep_args: List[str],
    targets: List[str],
    *,
    timeout: Optional[int],
    baseline: bool = False,
    explicit_semgrepignore_path: Optional[str] = None,
) -> Tuple[int, SemgrepOutput]:
    """
    Call semgrep passing in semgrep_args + targets as the arguments

    Also, save semgrep output as a list of json blobs in SEMGREP_SAVE_FILE to help debugging.
    Baseline scan output will be saved separately with the "_baseline" suffix.

    Returns the maximum semgrep exit code and the combined output as a SemgrepOutput object
    """
    max_exit_code = 0
    output = SemgrepOutput([], [], SemgrepTiming([], []))

    _env = (
        {
            "SEMGREP_R2C_INTERNAL_EXPLICIT_SEMGREPIGNORE": explicit_semgrepignore_path,
            **os.environ,
        }
        if explicit_semgrepignore_path
        else os.environ
    )

    semgrep_save_file_baseline = Path(SEMGREP_SAVE_FILE_BASELINE)
    if not baseline and semgrep_save_file_baseline.exists():
        semgrep_save_file_baseline.unlink()

    semgrep_save_file_path = SEMGREP_SAVE_FILE_BASELINE if baseline else SEMGREP_SAVE_FILE
    semgrep_save_file = open(semgrep_save_file_path, "w+")
    semgrep_save_file.write("[")

    first_chunk = True
    for chunk in chunked_iter(targets, PATHS_CHUNK_SIZE):
        with tempfile.NamedTemporaryFile("w") as output_json_file:
            args = semgrep_args.copy()
            args.extend(["--debug"])
            args.extend([
                "-o",
                output_json_file.name,  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ])
            for c in chunk:
                args.append(c)

            debug_echo(f"== Invoking semgrep with {len(args)} args")
            exit_code = semgrep_exec(*args, _timeout=timeout, _err=debug_echo, _env=_env).exit_code
            max_exit_code = max(max_exit_code, exit_code)
            debug_echo(f"== Semgrep finished with exit code {exit_code}")

            with open(
                output_json_file.name  # nosem: python.lang.correctness.tempfile.flush.tempfile-without-flush
            ) as f:
                semgrep_output = f.read()

            parsed_output = json.loads(semgrep_output)
            if first_chunk:
                first_chunk = False
            else:
                semgrep_save_file.write(",")
            semgrep_save_file.write(semgrep_output)

            output.results = [*output.results, *parsed_output["results"]]
            output.errors = [*output.errors, *parsed_output["errors"]]
            parsed_timing = parsed_output.get("time", {})
            output.timing = SemgrepTiming(
                parsed_timing.get("rules", output.timing.rules),
                [*output.timing.targets, *parsed_timing.get("targets", [])],
            )

    semgrep_save_file.write("]")
    semgrep_save_file.close()

    return max_exit_code, output
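The first_chunk flag above implements a small streaming JSON-array writer: the save file starts with "[", every chunk after the first is prefixed with a comma, and "]" is appended at the end, so the file stays valid JSON even though it is written incrementally. The same pattern in isolation looks like this; the file name and payloads are made up for illustration.

import json

def write_json_array_incrementally(path, json_blobs):
    """Write already-serialized JSON objects as one JSON array without holding them all in memory."""
    with open(path, "w") as save_file:
        save_file.write("[")
        first = True
        for blob in json_blobs:
            if first:
                first = False
            else:
                save_file.write(",")
            save_file.write(blob)
        save_file.write("]")

write_json_array_incrementally(
    "scan_chunks.json",
    (json.dumps({"results": [], "errors": [], "chunk": i}) for i in range(3)),
)
with open("scan_chunks.json") as f:
    assert len(json.load(f)) == 3  # the concatenated chunks parse as a single array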
def chunks(self, size):
    return KpaIterable(iterutils.chunked_iter(self, size))

def windows(self, size):
    return KpaIterable(iterutils.windowed_iter(self, size))
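The two helpers partition differently: chunked_iter produces disjoint groups (the last one may be short), while windowed_iter produces overlapping windows that slide one element at a time. A minimal comparison:

from boltons.iterutils import chunked_iter, windowed_iter

data = [1, 2, 3, 4, 5]
print(list(chunked_iter(data, 2)))   # disjoint chunks: [1, 2], [3, 4], [5]
print(list(windowed_iter(data, 2)))  # sliding windows: (1, 2), (2, 3), (3, 4), (4, 5)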
def main(start_date_, working_dir_, nblocks_, email_notification_, top_, archive=False): """ The parametrized main function for CLI in the cloud """ # use the following commands on Mac: # git status; git pull; git add test.py; # git commit -m "Art's SQL/Python script update"; git push # to update the script in the cloud # /anaconda3/bin/python test.py # --start-date 1990-01-01 --working-directory ./temp/ --nblocks 100 --archive # to launch the script (--email-notification -- another flag) # on Mac terminal from the dir where you have test.py # comand line arguments; use comments below as an example # TOP = 10000000 # reduce TOP value to 10 for debugging; put it to inf for a full run # DATE = '2017-01-01' -- 'from' parameter for historical pricing data # WORKING_DIR = './refinitiv_qa_direct_qai_master_and_pricing_tables/'\ # +str(time.strftime("%Y-%m-%d"))+'/' # dir where all outputs go; it can be dated as above # NBLOCKS = 100 # pricing data are very long queries; they need to be partitioned in blocks # as a separate project, optimize queries # # # pylint: disable=too-many-branches # pylint: disable=too-many-statements # pylint: disable=too-many-locals # pylint: disable=too-many-arguments # top = top_ date_from = start_date_ nblocks = nblocks_ cwd = os.path.realpath( os.path.dirname(__file__)) # instead of os.getcwd(), which is './' working_dir = working_dir_ # empty the whole working dir for root, dirs, files in os.walk(working_dir): for f_f in files: os.unlink(os.path.join(root, f_f)) for d_d in dirs: shutil.rmtree(os.path.join(root, d_d)) shutil.copy(os.path.join(cwd, 'master_file_joe.csv'), working_dir) # database = 'qai' server = 'cd5m7wkqacpdeus2mia12301.public.dabc3424290b.database.windows.net,3342' username = '******' password = '******' #Authentication: SQL Server Authentication # NOTE: The following works on a Mac with the MSSQL 13 driver installed - it is here as the # default because Art's Anaconda environment doesn't show a non-empty list of drivers from # pyodbc driver = '/usr/local/lib/libmsodbcsql.13.dylib' # '{ODBC Driver 13 for SQL Server}' drivers = [item for item in pyodbc.drivers()] if drivers: driver = drivers[0] #print('driver:{}'.format(driver)) # cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';PORT=1433;DATABASE=' + database + ';UID=' + username + ';PWD=' + password) cursor_ = cnxn.cursor() refinitiv_data_n_columns = 8 s_s = "" if top is not None: s_s = ''' TOP ''' + str(top) query = '''SELECT''' + s_s + ''' A.SecCode -- SecCode -- 0 -- , MR1.ID , MR1.NAME AS CURRNAME -- current name -- 1 , G1.ISSUER AS PITISSUER -- point-in-time name -- 2 -- , G1.EXCHANGE , MR1.Country -- country -- 3 , G1.StartDate -- from -- 4 , G1.EndDate -- to -- 5 , K1.TICKER -- ticker -- 6 -- , G1.EXCHANGE -- , I.ISSUER AS CURRENTISSUE --, I.STATUS , I.SECTYPE AS CURRSECTYPE -- type --7 FROM SecMstrX A JOIN SECMAPX M ON M.SECCODE = A.SECCODE AND M.VenType = 1 -- IDC AND TYPE_ = 1 -- NorthAmer Equity AND M.EXCHANGE <> 2 -- AND M.RANK = 1 -- VIEW ALL (commented out) OR CURRENT ONLY -- AND A.COUNTRY = 'USA' -- comment this out for ADR's JOIN Prc.PrcTKChg K ON M.VENCODE = K.Code JOIN PRC.PRcsCCHG G ON G.CODE = K.CODE AND ISNULL(G.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') --JOIN PRCCODE2 Y --ON Y.TYPE_ = 2 AND ASCII(G.EXCHANGE) = Y.CODE JOIN PRC.PRCINFO I ON I.CODE = G.CODE AND I.SECTYPE NOT IN ('X','P','E','I','S','U','W','0','7','T','Q','R','V') JOIN SECMAPX MP1 ON MP1.VENCODE = I.CODE AND MP1.RANK = M.RANK AND MP1.VENTYPE = 1 AND MP1.EXCHANGE = 
M.EXCHANGE JOIN SECMSTRX MR1 ON MR1.SECCODE = MP1.SECCODE AND MR1.TYPE_ = 1 JOIN SECMAPX MP2 ON MP2.SECCODE = MR1.SECCODE AND MP2.VENTYPE = 1 AND MP2.RANK = M.RANK JOIN PRC.PRCTKCHG K1 ON K1.CODE = MP2.VENCODE --AND ISNULL(K1.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') JOIN PRC.PRCSCCHG G1 ON G1.CODE = K1.CODE AND ISNULL(G1.ENDDATE,'1/1/2059') BETWEEN K1.STARTDATE AND ISNULL(K1.ENDDATE,'1/1/2059') GROUP BY A.SecCode , MR1.ID , MR1.NAME , G1.ISSUER , G1.EXCHANGE , MR1.Country , G1.StartDate , G1.EndDate , K1.TICKER , G1.EXCHANGE , I.ISSUER , I.STATUS , I.SECTYPE ORDER BY MR1.ID , G1.STARTDATE ''' # output the query string to a file with open(os.path.join(working_dir, 'query_master_table.txt'), "w") as query_file: query_file.write(query) print('\n\nexecuting the query ... ', datetime.now()) try: print('\n\ntrying to execute cursor_.execute(query) ...', datetime.now()) cursor_.execute(query) except Exception as err: print('\n\nexception #1 for cursor_.execute(query)', err, datetime.now()) print('\n\nfetching query result ... ', datetime.now()) try: print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now()) result = cursor_.fetchall() except Exception as err: print('\n\nexception #2 for result = cursor_.fetchall()', err, datetime.now()) tickers = [] print('\n\nwriting .csv file (master table) ... ', datetime.now()) with tqdm(total=len(result), file=sys.stdout) as pbar: table_master = [] table_merged = [] for row in result: pbar.set_description('progress at %s' % datetime.now()) pbar.update(1) row1 = [] row3 = [] # A.SecCode -- SecCode -- 0 #-- , MR1.ID # , MR1.NAME AS CURRNAME -- current name -- 1 # , G1.ISSUER AS PITISSUER -- point-in-time name -- 2 #-- , G1.EXCHANGE # , MR1.Country -- country -- 3 # , G1.StartDate -- from -- 4 # , G1.EndDate -- to -- 5 # , K1.TICKER -- ticker -- 6 #-- , G1.EXCHANGE #-- , I.ISSUER AS CURRENTISSUE #-- , I.STATUS # , I.SECTYPE AS CURRSECTYPE -- type --7 # date_to = datetime.date(datetime.now()) if row[5] is not None: # to date_to = datetime.date(row[5]) else: date_to = datetime.date(datetime.now()) if date_to > datetime.date(datetime.now()): date_to = datetime.date(datetime.now()) # row1.append(str(row[6])) # ticker tickers.append(row[6]) row1.append(str(row[2])) # point-in-time name row1.append(str(date_to)) # to # row1.append(str(row[0])) # SecCode row3.append(int(row[0])) # int for sorting row1.append(datetime.date(row[4])) # from row3.append(datetime.date(row[4])) row1.append(date_to) # to row3.append(date_to) row1.append(str(row[2])) # point-in-time name row3.append(str(row[2])) row1.append(str(row[6])) # ticker row3.append(str(row[6])) row1.append(str(row[3])) # country row3.append(str(row[3])) row1.append(str(row[1])) # current name row3.append(str(row[1])) row1.append(str(row[7])) # type row3.append(str(row[7])) if row1 not in table_merged: table_merged.append(row1) if row3 not in table_merged: table_master.append(row3) with open(os.path.join(working_dir, 'master_table.csv'), 'w') as result_file: table_master1 = [] table_master1.append( create_titles([ 'SecCode', 'From', 'To', 'Point-in-time name', 'Ticker', 'Country', 'Current name', 'Type' ])) table_master = sorted(table_master, key=lambda item: item[0]) table_master1 += table_master w_r = csv.writer(result_file, dialect='excel') w_r.writerows(table_master1) print('\n\npost-processing 1 ... 
', datetime.now()) with open(os.path.join(working_dir, 'master_file_joe.csv'), 'r') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') nrow = 0 for row in csv_reader: row1 = [] # change True to False to use the list if (str(row[3]) in ('C', 'BAC', 'AAPL') or True) and nrow != 0: # skip titles row1.append(str(row[3])) row1.append(str(row[4])) row1.append(str(row[2])) for _ in range(refinitiv_data_n_columns): row1.append('') # fill in with blanks for merged .csv for r_r in row: row1.append(r_r) table_merged.append(row1) nrow += 1 print('\n\npost-processing 2 ... ', datetime.now()) with open( os.path.join(working_dir, 'master_table_merged_art_vs_joe.csv'), 'w') as result_file: w_r = csv.writer(result_file, dialect='excel') table_merged1 = sorted(table_merged, key=operator.itemgetter(0, 1, 2)) table_merged2 = [] table_merged2.append( create_titles([ '', '', '', 'SecCode', 'From', 'To', 'Point-in-time name', 'Ticker', 'Country', 'Current name', 'Type', 'ID', 'FROM', 'TO', 'TICKER', 'NAME', 'TYPE' ])) table_merged2 += table_merged1 w_r.writerows(table_merged2) print('\n\npost-processing 3 ... ', datetime.now()) tickers_joe = [] # this should be an array of unique tickers i = 0 with open(os.path.join(working_dir, 'master_file_joe.csv'), 'r') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') for row in csv_reader: if i != 0: # skip titles at i = 0 if row[3] not in tickers_joe: # unique tickers tickers_joe.append(row[3]) i += 1 tikers_art = [] # this should be an array of unique tickers for t_t in tickers: if t_t not in tikers_art: tikers_art.append(t_t) print('\n\nnumber of unique tickers in the master: ', len(tikers_art), datetime.now()) if top is None: print('\n\npost-processing 4 ... ', datetime.now()) missing_tikers = [] for t_j in tickers_joe: if t_j not in tikers_art: # unique tickers missing_tikers.append(t_j) missing_tikers1 = [] for m_t in missing_tikers: if m_t not in missing_tikers1: # unique tickers missing_tikers1.append(m_t) print('\n\nnumber of missing tickers: ', len(missing_tikers1), datetime.now()) tickers_without_suffix = [] for m_t in missing_tikers1: if m_t.find('.') != -1: m_t = m_t.split('.')[0] else: m_t = m_t[: -1] # try to remove the fused suffix for missing tickers if m_t not in tickers_without_suffix: tickers_without_suffix.append(m_t) print('\n\nnumber of missing tickers without suffix: ', len(tickers_without_suffix), datetime.now()) query = '''SELECT * FROM PRC.PRCSCCHG WHERE TICKER IN (\'''' for tws in tickers_without_suffix: query += str(tws) + '''\', \'''' query = query[:-3] query += ''')''' try: print('\n\ntrying to execute cursor_.execute(query)...', datetime.now()) cursor_.execute(query) except Exception as err: print('\n\nexception #3 for cursor_.execute(query)', err, datetime.now()) print('\n\nfetching second query result ... 
', datetime.now()) try: print('\n\ntrying to execute result = cursor_.fetchall()...', datetime.now()) result = cursor_.fetchall() except Exception as err: print('\n\nexception #4 for result = cursor_.fetchall()', err, datetime.now()) with open(os.path.join(working_dir, 'addendum_master_table.csv'), 'w') as result_file: table_addendum = result table_addendum = sorted(table_addendum, key=operator.itemgetter(4)) table_addendum1 = [] table_addendum1.append( create_titles([ 'SecCode', 'From', 'To', 'CUSIP', 'Ticker', 'SEDOL', 'Issuer', 'Full ticker', 'Base ticker', 'Group', 'Series', 'Exchange' ])) table_addendum1 += table_addendum w_r = csv.writer(result_file, dialect='excel') w_r.writerows(table_addendum1) found_tickers = [] for row in result: if str(row[4]) not in found_tickers: found_tickers.append(str(row[4])) print('\n\nnumber of found tickers: ', len(found_tickers), datetime.now()) missing_tikers2 = [] for m_t in missing_tikers1: wosuffix = m_t if wosuffix.find('.') != -1: wosuffix = wosuffix.split('.')[0] else: wosuffix = wosuffix[:-1] # try to remove the fused suffix if wosuffix not in found_tickers and m_t not in found_tickers: # tickers w/o and with suffix missing_tikers2.append(m_t) print('\n\nfinal number of missing tickers: ', len(missing_tikers2), datetime.now()) print('\n\nwriting missing tickers ... ', datetime.now()) with open(os.path.join(working_dir, 'missing_tickers.csv'), 'w') as result_file: w_r = csv.writer(result_file, dialect='excel') missing_tikers2.sort() missing_tikers3 = [] for row in missing_tikers2: with open(os.path.join(working_dir, 'master_file_joe.csv'), 'r') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') i = 0 for row2 in csv_reader: if row2[3] == row and i != 0: # skip titles at i = 0 row5 = [] row5.append(str(row2[3])) row5.append(str(row2[4])) if row5 not in missing_tikers3: # unique entries missing_tikers3.append(row5) i += 1 missing_tikers4 = [] missing_tikers4.append(create_titles(['Tickers', 'Co. names'])) missing_tikers4 += missing_tikers3 w_r.writerows(missing_tikers4) print('\n\ndownloading pricing data ... 
', datetime.now()) seccodes = [] with open(os.path.join(working_dir, 'master_table.csv')) as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') l_l = 0 for row in csv_reader: if row[0] not in seccodes and l_l > 0: # skip titles, unique seccodes seccodes.append(row[0]) l_l += 1 print('\n\ndistinct seccodes = ', len(seccodes), datetime.now()) print('\n\nprocessing ...', datetime.now()) query = ''' --This query returns the fully adjusted Open, High, Low, and Close Pricing data in Local Currency using the Ds2Primqtprc table for North American Equities*/ SELECT DISTINCT A.SecCode -- seccode new col=0 -- , MR1.ID -- , MR1.NAME AS CURRNAME -- , G1.ISSUER AS PITISSUER -- , G1.EXCHANGE -- , MR1.Country -- , G1.StartDate -- , G1.EndDate , K1.TICKER -- ticker new col=1 -- , G1.EXCHANGE -- , I.ISSUER AS CURRENTISSUE -- , I.STATUS -- , I.SECTYPE AS CURRSECTYPE -- , C1.TotRet -- , C1.placeholder , C1.Date_ -- market date col=15; new col=2 , C1.Open_ -- col=16 open; new col=3 , C1.High -- col=17 high; new col=4 , C1.Low -- col=18 low; new col=5 , C1.Close_ -- col=19 close; new col=6 , C1.Volume -- col=20 volume; new col=7 , C1.TotRet -- col=21 totret; new col=8 FROM SecMstrX A JOIN SECMAPX M ON M.SECCODE = A.SECCODE AND M.VenType = 1 -- IDC AND TYPE_ = 1 -- NorthAmer Equity AND M.EXCHANGE <> 2 -- AND M.EXCHANGE = 1 AND A.TYPE_ = 1 -- AND M.RANK = 1 -- VIEW ALL OR CURRENT ONLY -- AND A.COUNTRY = 'USA' -- comment this out for ADR's JOIN Prc.PrcTKChg K ON M.VENCODE = K.Code JOIN PRC.PRcsCCHG G ON G.CODE = K.CODE AND ISNULL(G.ENDDATE,'1/1/2059') BETWEEN K.STARTDATE AND ISNULL(K.ENDDATE,'1/1/2059') JOIN PRC.PRCTKCHG K1 ON K1.CODE = K.CODE JOIN PRC.PRCDLY C1 ON C1.CODE = K1.CODE WHERE A.SECCODE IN (''' # block_size = int(len(seccodes) / nblocks) + 1 with tqdm(total=nblocks, file=sys.stdout) as pbar: list_ = [[] for n in range(20750101)] for seccodeblock in list(iterutils.chunked_iter(seccodes, block_size)): pbar.set_description('progress at %s' % time.strftime("%c")) pbar.update(1) query_seccodes = '' print('\n\nseccodeblock = ', len(seccodeblock), datetime.now()) for s_c in seccodeblock: query_seccodes += str(s_c) + ''',''' query_seccodes = query_seccodes[:-1] query_date = '''CAST(C1.Date_ AS DATETIME)>= \'''' + date_from + '''\'''' composed_query = query +\ query_seccodes + ''')\n\nAND\n\n''' +\ query_date + '''\n\nORDER BY C1.Date_''' with open(os.path.join(working_dir, 'query_pricing_data.txt'), 'w') as query_file: query_file.write(composed_query) keep_trying_to_query = True result = None # the query might fail because the computer got moved to a different location, # which resulted in IP change; in this case, try to re-open the connection, then re-do the query while keep_trying_to_query: try: print( '\n\ntrying to execute cursor_.execute(COMPOSED_query)...', datetime.now()) cursor_.execute(composed_query) try: print( '\n\ntrying to execute result = cursor_.fetchall()...', datetime.now()) result = cursor_.fetchall() keep_trying_to_query = False except Exception as err: try: print( '\n\nexception #5 for cursor_.execute(COMPOSED_query)', err, datetime.now()) print( '\n\nexception #6 for result = cursor_.fetchall()', err, datetime.now()) cursor_.close() cnxn.close() print("\n\nre-opening server connection...", datetime.now()) cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';PORT=1433;DATABASE=' + database + ';UID=' + username + ';PWD=' + password) cursor_ = cnxn.cursor() except Exception as err: print('\n\nexception #7 for reconnect', err, datetime.now()) except Exception as err: 
try: print( '\n\nexception #8 for cursor_.execute(COMPOSED_query)', err, datetime.now()) print( '\n\nexception #9 for result = cursor_.fetchall()', err, datetime.now()) cursor_.close() cnxn.close() print("\n\nre-opening server connection...", datetime.now()) cnxn = pyodbc.connect('DRIVER=' + driver + ';SERVER=' + server + ';PORT=1433;DATABASE=' + database + ';UID=' + username + ';PWD=' + password) cursor_ = cnxn.cursor() except Exception as err: print('\n\nexception #10 for reconnect', err, datetime.now()) # if result is not None: print("\n\nquery produced %d rows" % len(result), datetime.now()) for row in result: row3 = [] # A.SecCode -- seccode new col=0 # -- , MR1.ID # -- , MR1.NAME AS CURRNAME # -- , G1.ISSUER AS PITISSUER # -- , G1.EXCHANGE # -- , MR1.Country # -- , G1.StartDate # -- , G1.EndDate # , K1.TICKER -- ticker new col=1 # -- , G1.EXCHANGE # -- , I.ISSUER AS CURRENTISSUE # -- , I.STATUS # -- , I.SECTYPE AS CURRSECTYPE # -- , C1.TotRet # -- , C1.placeholder # , C1.MarketDate -- market date col=15; new col=2 # , C1.Open -- col=16 open; new col=3 # , C1.High -- col=17 high; new col=4 # , C1.Low -- col=18 low; new col=5 # , C1.Close -- col=19 close; new col=6 # , C1.Volume -- col=20 volume; new col=7 # , C1.TotRet -- col=21 totret; new col=8 # row3.append(int(row[0])) # SecCode row3.append(row[1]) # ticker if row[2] is not None: date1 = str(row[2])[:-9] # market date row3.append(date1) else: row3.append('-1.0') if row[3] is not None: row3.append(row[3]) # open else: row3.append('-1.0') if row[4] is not None: row3.append(row[4]) # high else: row3.append('-1.0') if row[5] is not None: row3.append(row[5]) # low else: row3.append('-1.0') if row[6] is not None: row3.append(row[6]) # unadjusted close else: row3.append('-1.0') if row[7] is not None: row3.append(row[7]) # volume else: row3.append('-1.0') if row[8] is not None: row3.append(row[8]) # TotRet else: row3.append('-1.0') idx = int(row[2].strftime('%Y%m%d')) if row3 not in list_[idx]: list_[idx].append(row3) # for i, i_t in enumerate(list_): if i_t: s_s = str(i) year = s_s[:-4] month = s_s[4:-2] day = s_s[6:] date2 = year + '-' + month + '-' + day table1 = [] table2 = [] table2.append( create_titles([ 'SecCode', 'Ticker', 'Date', 'Open', 'High', 'Low', 'Close, unadjusted', 'Volume', 'Total return' ])) for _, item in enumerate(i_t): if item not in table1: table1.append(item) table1 = sorted(table1, key=operator.itemgetter(0, 1)) table2 += table1 ofp = os.path.join(dir_from_date(date2, 'ym', working_dir), date2 + '.csv') with open(ofp, 'a') as result_file: w_r = csv.writer(result_file, dialect='excel') w_r.writerows(table2) # if archive: now = str(date.today()) print('\n\ncompressing output and timestamping ... ', datetime.now()) file_name = 'refinitiv_qa_direct_qai_master_and_pricing_tables_' + now print(file_name, datetime.now()) shutil.make_archive(file_name, 'zip', working_dir) print('\n\nmoving the data to the timestamped repository ... 
', datetime.now()) src = cwd data_repo = os.path.join(src, 'RefinitivDataRepository') if not os.path.exists(data_repo): os.mkdir(data_repo) if not os.path.isdir(data_repo): raise Exception(f'Data repository is not a directory: {data_repo}') output_file_staging_path = os.path.join(src, file_name + '.zip') output_file_path = Path(os.path.join(data_repo, file_name + '.zip')) print('OUTPUT_FILE_STAGING_PATH = ', output_file_staging_path, 'OUTPUT_FILE_PATH', output_file_path) if os.path.isfile(output_file_staging_path): if os.path.isfile(output_file_path): new_file_size = os.stat(output_file_staging_path).st_size old_file_size = os.stat(output_file_path).st_size print('\n\nnew zip size = ', new_file_size, '\told_file_size = ', old_file_size) if new_file_size > old_file_size: os.remove(output_file_path) shutil.move(output_file_staging_path, output_file_path) else: shutil.move(output_file_staging_path, output_file_path) if email_notification_: print( '\n\nemailing the confirmation and the link to compressed data to the author ... ', datetime.now()) alert = '''This is to notify that new compressed data set was uploaded to FORA google drive ...''' email = 'Alert time: ' + time.strftime("%c") + '\n' + alert client_email = [ '*****@*****.**', '*****@*****.**' ] # MESSAGE = create_message('*****@*****.**',\ # CLIENT_EMAIL, 'Completion alert', EMAIL) yagmail.SMTP('*****@*****.**').send( client_email, 'Completion alert', email) print('\n\nemailed to the user:\n' + alert, datetime.now()) print('\n\nexiting ... ', datetime.now())
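The pricing download above splits the SecCode list into blocks with iterutils.chunked_iter and splices each block directly into the SQL text of the IN clause. A safer variant of the same chunking idea uses parameter placeholders so the ODBC driver handles quoting and escaping. The sketch below is only illustrative: the table and column names are hypothetical simplifications, not the script's actual query, and the block size is an assumed value chosen to stay under SQL Server's parameter limit.

import pyodbc  # assumes an ODBC driver for SQL Server is installed
from boltons.iterutils import chunked_iter

BLOCK_SIZE = 1000  # assumption: comfortably below the ~2100-parameter limit

def fetch_prices_in_blocks(cnxn, seccodes, date_from):
    """Yield pricing rows block by block, using ? placeholders instead of string splicing."""
    cursor = cnxn.cursor()
    for block in chunked_iter(seccodes, BLOCK_SIZE):
        placeholders = ",".join("?" * len(block))
        query = (
            "SELECT SecCode, Date_, Close_ "              # hypothetical columns
            "FROM Prc.PrcDly "                            # hypothetical table
            f"WHERE SecCode IN ({placeholders}) AND Date_ >= ? "
            "ORDER BY Date_"
        )
        cursor.execute(query, (*block, date_from))
        yield from cursor.fetchall()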
def invoke_semgrep(
    config_specifier: str,
    committed_datetime: Optional[datetime],
    base_commit_ref: Optional[str],
    semgrep_ignore: TextIO,
    uses_managed_policy: bool,
) -> FindingSets:
    debug_echo("=== adding semgrep configuration")

    workdir = Path.cwd()
    targets = TargetFileManager(
        base_path=workdir,
        base_commit=base_commit_ref,
        paths=[workdir],
        ignore_rules_file=semgrep_ignore,
    )

    config_args = ["--config", config_specifier]
    rewrite_args = ["--no-rewrite-rule-ids"] if uses_managed_policy else []

    debug_echo("=== seeing if there are any findings")
    findings = FindingSets()

    with targets.current_paths() as paths:
        click.echo("=== looking for current issues in " + unit_len(paths, "file"), err=True)
        for chunk in chunked_iter(paths, PATHS_CHUNK_SIZE):
            args = [
                "--skip-unknown-extensions",
                "--disable-nosem",
                "--json",
                *rewrite_args,
                *config_args,
            ]
            for path in chunk:
                args.append(path)
            semgrep_results = json.loads(str(semgrep(*args)))["results"]
            findings.current.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if not result["extra"].get("is_ignored")
            )
            findings.ignored.update_findings(
                Finding.from_semgrep_result(result, committed_datetime)
                for result in semgrep_results
                if result["extra"].get("is_ignored")
            )
        click.echo(f"| {unit_len(findings.current, 'current issue')} found", err=True)
        click.echo(f"| {unit_len(findings.ignored, 'ignored issue')} found", err=True)

    if not findings.current:
        click.echo(
            "=== not looking at pre-existing issues since there are no current issues",
            err=True,
        )
    else:
        with targets.baseline_paths() as paths:
            paths_with_findings = {finding.path for finding in findings.current}
            paths_to_check = set(str(path) for path in paths) & paths_with_findings
            if not paths_to_check:
                click.echo(
                    "=== not looking at pre-existing issues since all files with current issues are newly created",
                    err=True,
                )
            else:
                click.echo(
                    "=== looking for pre-existing issues in " + unit_len(paths_to_check, "file"),
                    err=True,
                )
                for chunk in chunked_iter(paths_to_check, PATHS_CHUNK_SIZE):
                    args = [
                        "--skip-unknown-extensions",
                        "--json",
                        *rewrite_args,
                        *config_args,
                    ]
                    for path in chunk:
                        args.append(path)
                    semgrep_results = json.loads(str(semgrep(*args)))["results"]
                    findings.baseline.update_findings(
                        Finding.from_semgrep_result(result, committed_datetime)
                        for result in semgrep_results
                    )
                click.echo(f"| {unit_len(findings.baseline, 'pre-existing issue')} found", err=True)

    if os.getenv("INPUT_GENERATESARIF"):
        # FIXME: This will crash when running on thousands of files due to command length limit
        click.echo("=== re-running scan to generate a SARIF report", err=True)
        sarif_path = Path("semgrep.sarif")
        with targets.current_paths() as paths, sarif_path.open("w") as sarif_file:
            args = ["--sarif", *rewrite_args, *config_args]
            for path in paths:
                args.extend(["--include", path])
            semgrep(*args, _out=sarif_file)
        rewrite_sarif_file(sarif_path)

    return findings
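The current/ignored split above keys off the is_ignored flag inside each result's "extra" block. A minimal stand-alone sketch of that partitioning step; the result dictionaries below are fabricated for illustration only.

def partition_by_ignored(results):
    """Split raw scan results into (current, ignored) based on extra.is_ignored."""
    current = [r for r in results if not r.get("extra", {}).get("is_ignored")]
    ignored = [r for r in results if r.get("extra", {}).get("is_ignored")]
    return current, ignored

sample = [
    {"check_id": "rule.a", "extra": {"is_ignored": False}},
    {"check_id": "rule.b", "extra": {"is_ignored": True}},
    {"check_id": "rule.c", "extra": {}},  # missing flag counts as not ignored
]
current, ignored = partition_by_ignored(sample)
assert [r["check_id"] for r in current] == ["rule.a", "rule.c"]
assert [r["check_id"] for r in ignored] == ["rule.b"]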
def batches_iter(self, batch_size):
    return chunked_iter(iter(self), batch_size)