def load_dictionary_full(db, dictionaries_dir, change_tracker):
    print('Loading dictionary_full')
    dictionary_full_files = list((dictionaries_dir / 'en').glob('*.json'))
    if not change_tracker.is_any_file_new_or_changed(dictionary_full_files):
        return
    dictionary_full_collection = db['dictionary_full']

    docs = []

    # For now hardcode this, in the future we may have other pairs
    lang_from = 'pli'
    lang_to = 'en'

    words_seen = Counter()
    ids_seen = Counter()

    for dictionary in tqdm(dictionary_full_files, desc="Loading dictionaries", ncols=79):
        entries = json_load(dictionary)
        for entry in entries:
            word = entry['word'].lower()
            words_seen[word] += 1

            # create a meaningful id
            word_ascii = asciify_roman(word)
            _key = regex.sub(r'[^a-z0-9]+', '_', word_ascii)
            if _key in ids_seen:
                ids_seen[_key] += 1
                _key += str(ids_seen[_key])
            else:
                ids_seen[_key] = 1

            doc = {
                '_key': _key,
                'dictname': dictionary.stem,
                'lang_to': lang_to,
                'lang_from': lang_from,
                **entry,
                'word': word,
                'word_ascii': word_ascii,
            }
            docs.append(doc)

    seen = set()
    for doc in docs:
        if doc['_key'] in seen:
            print(f'Duplicate: {doc["_key"]}')
        seen.add(doc['_key'])

    words_sorted = sorted(words_seen, key=pali_sort_key)
    word_number = {w: i for i, w in enumerate(words_sorted)}

    for doc in docs:
        doc['num'] = word_number[doc['word']]

    dictionary_full_collection.truncate()
    for chunk in chunks(docs, 1000):
        dictionary_full_collection.import_bulk_logged(chunk, on_duplicate="ignore")
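# Every loader in this file relies on a `chunks` helper that is not defined
# here. A minimal sketch of what it presumably does (yield successive
# fixed-size batches from an iterable) is given below for reference; the real
# helper may differ in detail.
from itertools import islice

def chunks(iterable, chunk_size):
    """Yield successive lists of at most `chunk_size` items from `iterable`."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            return
        yield chunk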
def load_complex_dictionaries(db: Database, dictionaries_dir: Path):
    complex_dictionaries_dir = dictionaries_dir / 'complex'

    docs = []
    words_seen = Counter()
    ids_seen = Counter()

    for dictionary in tqdm(complex_dictionaries_dir.glob('**/*.json')):
        from_lang, to_lang_and_name = dictionary.stem.split('2')
        to_lang, dict_name = to_lang_and_name.split('_')
        content = json_load(dictionary)
        for item in content:
            word = item['word']
            words_seen[word] += 1

            # create a meaningful id
            word_ascii = asciify_roman(word)
            _key = regex.sub(r'[^a-z0-9]+', '_', word_ascii)
            if _key in ids_seen:
                ids_seen[_key] += 1
                _key += str(ids_seen[_key])
            else:
                ids_seen[_key] = 1

            docs.append({
                '_key': _key,
                'from': from_lang,
                'to': to_lang,
                'word': item['word'],
                'text': item['text'],
                'word_ascii': word_ascii,
                'dictname': dict_name,
            })

    seen = set()
    for doc in docs:
        if doc['_key'] in seen:
            print(f'Duplicate: {doc["_key"]}')
        seen.add(doc['_key'])

    words_sorted = sorted(words_seen, key=pali_sort_key)
    word_number = {w: i for i, w in enumerate(words_sorted)}

    for doc in docs:
        doc['num'] = word_number[doc['word']]

    collection = db.collection('dictionaries_complex')
    collection.truncate()
    for chunk in chunks(docs, 1000):
        collection.import_bulk(chunk, on_duplicate='ignore')
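# Both dictionary loaders above repeat the same idiom for turning a headword
# into a unique document key: ASCII-fold the word, replace non-alphanumerics
# with '_', and append a counter on collisions. A hypothetical standalone
# helper capturing that idiom (the name make_unique_key is illustrative and
# not part of the original code):
import regex

def make_unique_key(word_ascii, ids_seen):
    """Return a collision-free key for `word_ascii`, updating `ids_seen` in place."""
    _key = regex.sub(r'[^a-z0-9]+', '_', word_ascii)
    if _key in ids_seen:
        ids_seen[_key] += 1
        _key += str(ids_seen[_key])
    else:
        ids_seen[_key] = 1
    return _key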
def get_rect(self, image_stream):
    result = {"objects": []}
    print("=======================================")
    print("start read {0}".format(time.time()))
    img_data = utils.readb64(image_stream)
    img_h = int(img_data.shape[0] / self.scaling_factor)
    img_w = int(img_data.shape[1] / self.scaling_factor)

    print("start resize {0}".format(time.time()))
    resized_img = cv2.resize(img_data, (img_w, img_h), interpolation=cv2.INTER_AREA)

    print("gen roi {0}".format(time.time()))
    locations = self.dynamic_gen_roi_pos(resized_img, self.stride)
    print("location len {0}".format(len(locations)))

    hog_features = utils.get_hog(resized_img, locations=locations,
                                 winSize=(self.roi_w_len, self.roi_h_len))
    # The HOG descriptor comes back as one flat vector; split it into one
    # feature vector per ROI location before classification.
    hog_feature_list = list(utils.chunks(hog_features, int(len(hog_features) / len(locations))))
    predict_results = predict.clf.predict(hog_feature_list)

    predict.captured = False
    for i in range(0, len(predict_results)):
        if predict_results[i] == 0:
            u_h = locations[i][1]
            d_h = u_h + self.roi_h_len
            l_w = locations[i][0]
            r_w = l_w + self.roi_w_len
            result["objects"].append({
                "positions": {"cross": [[l_w * self.scaling_factor, u_h * self.scaling_factor],
                                        [r_w * self.scaling_factor, u_h * self.scaling_factor],
                                        [r_w * self.scaling_factor, d_h * self.scaling_factor],
                                        [l_w * self.scaling_factor, d_h * self.scaling_factor]]},
                "attributes": {"status": "Normal"}
            })
            predict.pre_pos = (l_w, u_h)
            predict.captured = True
            break
    return result
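# utils.get_hog above is not shown in this file; the per-location split of the
# flat feature vector suggests it wraps cv2.HOGDescriptor.compute(), which
# returns the descriptors of all requested windows concatenated into one flat
# array. A hedged sketch under that assumption (block and cell sizes below are
# illustrative, not taken from the original code):
import cv2
import numpy as np

def get_hog_per_location(img, locations, win_size=(64, 64)):
    # winSize, blockSize, blockStride, cellSize, nbins
    hog = cv2.HOGDescriptor(win_size, (16, 16), (8, 8), (8, 8), 9)
    flat = hog.compute(img, winStride=(8, 8), padding=(0, 0), locations=locations)
    # One row per ROI location, matching the chunking done in get_rect above.
    return np.asarray(flat).reshape(len(locations), -1)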
def handle(self, *args, **options):
    start_time = time.time()
    input_file = options['input_file']

    if options['input_file'] in [None, ''] or os.path.exists(input_file) is False:
        print("Unable to locate file. Abort captain. ABORT !")
        return

    print("Importing data from: '{}'".format(input_file))

    files_sig = []
    with open(input_file, 'r') as f:
        data = json.load(f)
        for cve_id in tqdm(data["cves"].keys(), desc="VIA-pre"):
            try:
                if options['year'] not in [None, ''] and re.match(
                        r'.*([1-3][0-9]{3})', options['year']
                ) is not None and options['year'] != cve_id.split('-')[1]:
                    continue
                # sync_exploits_fromvia(cve_id, data["cves"][cve_id])
                files_sig.append(
                    import_via_task.s(cve_id, data["cves"][cve_id]).set(queue='data'))
            except Exception:
                pass

    pbar = tqdm(total=len(files_sig), desc="VIA-run")
    for chunk in chunks(files_sig, CHUNK_SIZE):
        res = group(chunk)()
        res.get()
        pbar.update(CHUNK_SIZE)
    pbar.close()

    elapsed_time = time.time() - start_time
    print("Import done! Well done captain.")
    print("Elapsed time (in seconds): %.3f" % elapsed_time)
def parse_indices(sequence, parameters):
    # Header and initialisation reconstructed from the call site below; the
    # function is assumed to build a dict of slice positions for the read.
    indices = {}
    for index in parameters:
        if parameters[index]['from_end']:
            indices[index] = len(sequence) - parameters[index]['index']
        else:
            indices[index] = parameters[index]['index']
    return indices


r = []
print("Reading source file into in-memory database of barcodes: {}".format(snakemake.input['fastqfile']))
con = sqlite3.connect(":memory:")
cur = con.cursor()
print("")
cur.execute('CREATE TABLE barcodes (seqid TEXT, celbc TEXT, umi TEXT)')

# reading from file and writing to database in chunks to save memory
fastq_parser = Bank(snakemake.input.fastqfile[0])
for chunk in chunks(fastq_parser, 10000):
    r = []
    for seq in chunk:
        sequence = seq.sequence.decode("utf-8")
        indices = parse_indices(sequence, config['params']['barcoding'])
        umi = sequence[indices["umi_start"]:indices["umi_end"]]
        cel = sequence[indices["cell_bc_start"]:indices["cell_bc_end"]]
        seqid = seq.comment.decode("utf-8").split(" ")[0]
        r.append((seqid, cel, umi))
    cur.executemany('INSERT INTO barcodes VALUES (?,?,?)', r)
r = None

print("Creating index on read identifiers")
cur.execute('CREATE UNIQUE INDEX seqidx ON barcodes (seqid)')

print("Writing output file: {}".format(snakemake.output))
fi = open(snakemake.input.samfile, 'r')
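# The shape of config['params']['barcoding'] is not shown in this snippet. From
# the 'from_end'/'index' fields read by parse_indices and the slice names used
# above, it is presumably a mapping like the following (the positions here are
# purely illustrative, not taken from the workflow's config):
example_barcoding_params = {
    'cell_bc_start': {'index': 0, 'from_end': False},
    'cell_bc_end': {'index': 12, 'from_end': False},
    'umi_start': {'index': 12, 'from_end': False},
    'umi_end': {'index': 8, 'from_end': True},  # counted back from the 3' end
}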
def handle(self, *args, **options):
    start_time = time.time()
    input_dir = options['input_dir']
    last_update = None
    allowed_feeds = None

    # Validate options
    if options['input_dir'] in [None, ''] or os.path.isdir(input_dir) is False:
        print("Unable to locate directory. Abort captain. ABORT !")
        return

    if options['enumerate_feeds'] is True:
        for feed_dirname in os.listdir(input_dir):
            feed_dirname_full = os.path.join(input_dir, feed_dirname, 'data/exploits')
            if os.path.isdir(feed_dirname_full) and feed_dirname not in ['__pycache__']:
                print(feed_dirname)
        return

    if options['feeds'] not in [None, '']:
        try:
            allowed_feeds = options['feeds'].split(',')
        except Exception:
            print("Bad feeds format (use commas). Abort captain. ABORT !")
            return

    if options['last_update'] not in [None, '']:
        last_update = ""
        try:
            last_update = datetime.strptime(options['last_update'], '%Y-%m-%d')
            print("Only last updates from {}".format(last_update))
        except Exception:
            print("Bad datetime format (Use 'YYYY-MM-DD' instead). Abort captain. ABORT !")
            return

    print("Importing data from: '{}'".format(input_dir))

    # Find exploits
    for feed_dirname in os.listdir(input_dir):
        if allowed_feeds is not None and feed_dirname not in allowed_feeds:
            continue

        print("Checking already submitted files")
        feed_checksums = get_checksums(input_dir, feed_dirname)
        already_imported_files = list(
            DataFeedImport.objects.filter(
                hash__in=feed_checksums.keys(),
                has_error=False,
                type='exploit',
                source=feed_dirname.lower()
            ).values_list('filename', flat=True))

        feed_dirname_full = os.path.join(input_dir, feed_dirname)
        feed_datadir = os.path.join(feed_dirname_full, 'data')
        feed_data_exploits_dir = os.path.join(feed_datadir, 'exploits')

        if os.path.isdir(feed_dirname_full) and os.path.isdir(feed_datadir) and os.path.isdir(feed_data_exploits_dir):
            feed_files = []      # filenames
            feed_files_sig = []  # task signatures

            for year_dir in os.listdir(feed_data_exploits_dir):
                if options['year'] not in [None, ''] and re.match(
                        r'.*([1-3][0-9]{3})', options['year']
                ) is not None and options['year'] != year_dir.split('/')[-1]:
                    continue
                year_dir_path = os.path.join(feed_data_exploits_dir, year_dir)
                for month_dir in os.listdir(year_dir_path):
                    month_dir_path = os.path.join(year_dir_path, month_dir)
                    for day_dir in os.listdir(month_dir_path):
                        day_dir_path = os.path.join(month_dir_path, day_dir)
                        for exploit_filename in os.listdir(day_dir_path):
                            if exploit_filename not in already_imported_files:
                                feed_files.append(os.path.join(day_dir_path, exploit_filename))

            for file in tqdm(feed_files, desc="{}-pre".format(feed_dirname)):
                try:
                    with open(file, 'r') as f:
                        file_data = json.load(f)
                        file_checked_at = datetime.strptime(
                            file_data['checked_at'].split(' ')[0], '%Y-%m-%d')
                        if last_update in [None, ''] or last_update < file_checked_at:
                            # import_exploit(file_data)
                            # import_exploit_task.apply_async(
                            #     args=[file_data],
                            #     queue='data',
                            #     retry=False
                            # )
                            feed_files_sig.append(
                                import_exploit_task.s(file_data).set(queue='data'))
                except Exception as e:
                    print(e)

            pbar = tqdm(total=len(feed_files_sig), desc="{}-run".format(feed_dirname))
            for chunk in chunks(feed_files_sig, CHUNK_SIZE):
                res = group(chunk)()
                res.get()
                pbar.update(CHUNK_SIZE)
            pbar.close()

    elapsed_time = time.time() - start_time
    print("Import done! Well done captain.")
    print("Elapsed time (in seconds): %.3f" % elapsed_time)
def generate_relationship_edges(change_tracker, relationship_dir, additional_info_dir, db):
    relationship_files = list(relationship_dir.glob('*.json'))

    if not change_tracker.is_any_file_new_or_changed(relationship_files):
        return

    print('Generating Parallels')
    relationship_data = []
    for relationship_file in relationship_files:
        relationship_data.extend(json_load(relationship_file))

    uid_matcher = get_uid_matcher(db)

    remarks_data = json_load(additional_info_dir / 'notes.json')
    remarks = defaultdict(dict)
    for remark in remarks_data:
        uids = remark['relations']
        remark_text = remark['remark']
        remarks[frozenset(uids)] = remark_text

    antispam = set()
    ll_edges = []
    for entry in tqdm(relationship_data):
        entry.pop('remarks', None)
        for r_type, uids in entry.items():
            if r_type == 'retells':
                r_type = 'retelling'
            elif r_type == 'mentions':
                r_type = 'mention'
            elif r_type == 'parallels':
                r_type = 'full'
            if r_type == 'full':
                full = [uid for uid in uids if not uid.startswith('~')]
                partial = [uid for uid in uids if uid.startswith('~')]
                for from_uid in full:
                    m = regex.search('[0-9]+$', from_uid)
                    if m:
                        from_nr = int(m[0])
                    else:
                        from_nr = 0
                    true_from_uids = uid_matcher.get_matching_uids(from_uid)
                    if not true_from_uids and ' ' not in from_uid:
                        logging.error(
                            f'Relationship from uid could not be matched: {from_uid} (dropped)'
                        )
                        continue
                    for to_uids, is_resembling in ((full, False), (partial, True)):
                        for to_uid in to_uids:
                            if to_uid == from_uid:
                                continue
                            true_to_uids = uid_matcher.get_matching_uids(to_uid)
                            if not true_to_uids:
                                logging.info(
                                    f'Relationship to uid could not be matched: {to_uid} (appears as orphan)'
                                )
                                true_to_uids = ['orphan']
                            for true_from_uid in true_from_uids:
                                for true_to_uid in true_to_uids:
                                    remark = remarks.get(
                                        frozenset([true_from_uid, true_to_uid]), None)
                                    ll_edges.append({
                                        '_from': true_from_uid,
                                        '_to': true_to_uid,
                                        'from': from_uid,
                                        'number': from_nr,
                                        'to': to_uid.lstrip('~'),
                                        'type': r_type,
                                        'resembling': is_resembling,
                                        'remark': remark,
                                    })
            else:
                first_uid = uids[0]
                m = regex.search('[0-9]+$', first_uid)
                if m:
                    from_nr = int(m[0])
                else:
                    from_nr = 0
                true_first_uids = uid_matcher.get_matching_uids(first_uid)
                for true_first_uid, to_uid in product(true_first_uids, uids[1:]):
                    true_from_uids = uid_matcher.get_matching_uids(to_uid)
                    if not true_from_uids and ' ' not in to_uid:
                        logging.error(
                            f'Relationship uid could not be matched: {to_uid} (dropped)'
                        )
                        continue
                    for true_from_uid in true_from_uids:
                        remark = remarks.get(
                            frozenset([true_from_uid, true_first_uid]), None)
                        ll_edges.append({
                            '_from': true_first_uid,
                            '_to': true_from_uid,
                            'from': first_uid.lstrip('~'),
                            'to': to_uid,
                            'number': from_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark,
                        })
                        m = regex.search('[0-9]+$', to_uid)
                        if m:
                            to_nr = int(m[0])
                        else:
                            to_nr = 0
                        ll_edges.append({
                            '_from': true_from_uid,
                            '_to': true_first_uid,
                            'from': to_uid,
                            'to': first_uid.lstrip('~'),
                            'number': to_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark,
                        })

    # Because there are many edges (nearly 400k at last count) chunk the import
    db['relationship'].truncate()
    for chunk in chunks(ll_edges, 10000):
        db['relationship'].import_bulk_logged(
            chunk,
            from_prefix='super_nav_details',
            to_prefix='super_nav_details')
def generate_relationship_edges(change_tracker, relationship_dir, additional_info_dir, db):
    relationship_files = list(relationship_dir.glob('*.json'))

    if not change_tracker.is_any_file_new_or_changed(relationship_files):
        return

    print('Generating Parallels')
    relationship_data = []
    for relationship_file in relationship_files:
        relationship_data.extend(json_load(relationship_file))

    uid_matcher = get_uid_matcher(db)

    remarks_data = json_load(additional_info_dir / 'notes.json')
    remarks = defaultdict(dict)
    for remark in remarks_data:
        uids = remark['relations']
        remark_text = remark['remark']
        remarks[frozenset(uids)] = remark_text

    antispam = set()
    ll_edges = []
    for entry in tqdm(relationship_data):
        entry.pop('remarks', None)
        for r_type, uids in entry.items():
            if r_type == 'retells':
                r_type = 'retelling'
            elif r_type == 'mentions':
                r_type = 'mention'
            elif r_type == 'parallels':
                r_type = 'full'
            if r_type == 'full':
                full = [uid for uid in uids if not uid.startswith('~')]
                partial = [uid for uid in uids if uid.startswith('~')]
                for from_uid in full:
                    m = regex.search('[0-9]+$', from_uid)
                    if m:
                        from_nr = int(m[0])
                    else:
                        from_nr = 0
                    true_from_uids = uid_matcher.get_matching_uids(from_uid)
                    if not true_from_uids and ' ' not in from_uid:
                        logging.error(f'Relationship from uid could not be matched: {from_uid} (dropped)')
                        continue
                    for to_uids, is_resembling in ((full, False), (partial, True)):
                        for to_uid in to_uids:
                            if to_uid == from_uid:
                                continue
                            true_to_uids = uid_matcher.get_matching_uids(to_uid)
                            if not true_to_uids:
                                logging.error(f'Relationship to uid could not be matched: {to_uid} (appears as orphan)')
                                true_to_uids = ['orphan']
                            for true_from_uid in true_from_uids:
                                for true_to_uid in true_to_uids:
                                    remark = remarks.get(frozenset([true_from_uid, true_to_uid]), None)
                                    ll_edges.append({
                                        '_from': true_from_uid,
                                        '_to': true_to_uid,
                                        'from': from_uid,
                                        'number': from_nr,
                                        'to': to_uid.lstrip('~'),
                                        'type': r_type,
                                        'resembling': is_resembling,
                                        'remark': remark
                                    })
            else:
                first_uid = uids[0]
                m = regex.search('[0-9]+$', first_uid)
                if m:
                    from_nr = int(m[0])
                else:
                    from_nr = 0
                true_first_uids = uid_matcher.get_matching_uids(first_uid)
                for true_first_uid, to_uid in product(true_first_uids, uids[1:]):
                    true_from_uids = uid_matcher.get_matching_uids(to_uid)
                    if not true_from_uids and ' ' not in to_uid:
                        logging.error(f'Relationship uid could not be matched: {to_uid} (dropped)')
                        continue
                    for true_from_uid in true_from_uids:
                        remark = remarks.get(frozenset([true_from_uid, true_first_uid]), None)
                        ll_edges.append({
                            '_from': true_first_uid,
                            '_to': true_from_uid,
                            'from': first_uid.lstrip('~'),
                            'to': to_uid,
                            'number': from_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark
                        })
                        m = regex.search('[0-9]+$', to_uid)
                        if m:
                            to_nr = int(m[0])
                        else:
                            to_nr = 0
                        ll_edges.append({
                            '_from': true_from_uid,
                            '_to': true_first_uid,
                            'from': to_uid,
                            'to': first_uid.lstrip('~'),
                            'number': to_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark
                        })

    # Because there are many edges (nearly 400k at last count) chunk the import
    db['relationship'].truncate()
    for chunk in chunks(ll_edges, 10000):
        db['relationship'].import_bulk(chunk, from_prefix='root/', to_prefix='root/')
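# Note on the chunked edge imports above: python-arango's import_bulk accepts
# from_prefix / to_prefix, which are prepended to bare '_from' / '_to' values at
# import time, so the edge documents built above can carry plain uids. A minimal,
# hedged illustration; the connection details, database name, and uid values are
# placeholders, not taken from the original code:
from arango import ArangoClient

def import_example_edges():
    db = ArangoClient().db('example_db', username='root', password='')
    edges = [{'_from': 'dn1', '_to': 'dn2', 'type': 'full'}]
    # Stored with _from='root/dn1' and _to='root/dn2'.
    db.collection('relationship').import_bulk(edges, from_prefix='root/', to_prefix='root/')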