Example No. 1
def load_dictionary_full(db, dictionaries_dir, change_tracker):
    print('Loading dictionary_full')

    dictionary_full_files = list((dictionaries_dir / 'en').glob('*.json'))
    if not change_tracker.is_any_file_new_or_changed(dictionary_full_files):
        return

    dictionary_full_collection = db['dictionary_full']
    docs = []
    # For now hardcode this, in the future we may have other pairs
    lang_from = 'pli'
    lang_to = 'en'
    words_seen = Counter()
    ids_seen = Counter()
    for dictionary in tqdm(
        dictionary_full_files, desc="Loading dictionaries", ncols=79
    ):
        entries = json_load(dictionary)
        for entry in entries:
            word = entry['word'].lower()
            words_seen[word] += 1

            # create a meaningful id
            word_ascii = asciify_roman(word)
            _key = regex.sub(r'[^a-z0-9]+', '_', word_ascii)
            if _key in ids_seen:
                ids_seen[_key] += 1
                _key += str(ids_seen[_key])
            else:
                ids_seen[_key] = 1

            doc = {
                '_key': _key,
                'dictname': dictionary.stem,
                'lang_to': lang_to,
                'lang_from': lang_from,
                **entry,
                'word': word,
                'word_ascii': word_ascii,
            }
            docs.append(doc)

    # report duplicate keys; every key must be added to `seen`, not only the duplicates
    seen = set()
    for doc in docs:
        if doc['_key'] in seen:
            print(f'Duplicate: {doc["_key"]}')
        seen.add(doc['_key'])

    words_sorted = sorted(words_seen, key=pali_sort_key)
    word_number = {w: i for i, w in enumerate(words_sorted)}

    for doc in docs:
        doc['num'] = word_number[doc['word']]

    dictionary_full_collection.truncate()
    for chunk in chunks(docs, 1000):
        dictionary_full_collection.import_bulk_logged(chunk, on_duplicate="ignore")
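
Every loader on this page batches its writes through the same small `chunks` helper, which is not shown here. A minimal sketch of what it is assumed to look like (any generator yielding fixed-size slices would do):

def chunks(iterable, chunk_size):
    """Yield successive lists of at most chunk_size items from iterable."""
    buffer = []
    for item in iterable:
        buffer.append(item)
        if len(buffer) == chunk_size:
            yield buffer
            buffer = []
    if buffer:
        yield buffer

With a helper like this, the loop at the end of the loader imports the documents 1000 at a time instead of sending one huge request.
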
Example No. 2
def load_complex_dictionaries(db: Database, dictionaries_dir: Path):
    complex_dictionaries_dir = dictionaries_dir / 'complex'
    docs = []
    words_seen = Counter()
    ids_seen = Counter()
    for dictionary in tqdm(complex_dictionaries_dir.glob('**/*.json')):
        from_lang, to_lang_and_name = dictionary.stem.split('2')
        to_lang, dict_name = to_lang_and_name.split('_')
        content = json_load(dictionary)
        for item in content:
            word = item['word']
            words_seen[word] += 1

            # create a meaningful id
            word_ascii = asciify_roman(word)
            _key = regex.sub(r'[^a-z0-9]+', '_', word_ascii)
            if _key in ids_seen:
                ids_seen[_key] += 1
                _key += str(ids_seen[_key])
            else:
                ids_seen[_key] = 1

            docs.append({
                '_key': _key,
                'from': from_lang,
                'to': to_lang,
                'word': item['word'],
                'text': item['text'],
                'word_ascii': word_ascii,
                'dictname': dict_name,
            })

    # report duplicate keys; every key must be added to `seen`, not only the duplicates
    seen = set()
    for doc in docs:
        if doc['_key'] in seen:
            print(f'Duplicate: {doc["_key"]}')
        seen.add(doc['_key'])

    words_sorted = sorted(words_seen, key=pali_sort_key)
    word_number = {w: i for i, w in enumerate(words_sorted)}

    for doc in docs:
        doc['num'] = word_number[doc['word']]

    collection = db.collection('dictionaries_complex')
    collection.truncate()
    for chunk in chunks(docs, 1000):
        collection.import_bulk(chunk, on_duplicate='ignore')
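
Both dictionary loaders build the document `_key` the same way: the headword is folded to ASCII, slugged down to `[a-z0-9_]`, and a numeric suffix is taken from a `Counter` whenever the slug has already been used. A standalone sketch of just that idea, using the stdlib `re` and plain lowercasing in place of the project's `regex` and `asciify_roman` helpers:

import re
from collections import Counter

def make_keys(words):
    # Turn headwords into unique slug-style _key values.
    ids_seen = Counter()
    keys = []
    for word in words:
        slug = re.sub(r'[^a-z0-9]+', '_', word.lower())
        if slug in ids_seen:
            ids_seen[slug] += 1
            slug += str(ids_seen[slug])
        else:
            ids_seen[slug] = 1
        keys.append(slug)
    return keys

print(make_keys(['dhamma', 'Dhamma', 'dhamma-cakkhu']))
# -> ['dhamma', 'dhamma2', 'dhamma_cakkhu']
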
Example No. 3
    def get_rect(self, image_stream):
        result = {"objects":[]}
        print("=======================================")
        print("start read {0}".format(time.time()))
        img_data = utils.readb64(image_stream)
        img_h = int(img_data.shape[0] / self.scaling_factor)
        img_w = int(img_data.shape[1] / self.scaling_factor)
        print("start resize {0}".format(time.time()))
        resized_img = cv2.resize(img_data, (img_w, img_h), interpolation=cv2.INTER_AREA)
        print("gen roi {0}".format(time.time()))
        locations = self.dynamic_gen_roi_pos(resized_img, self.stride)
        print("location len {0}".format(len(locations)))

        # compute HOG descriptors for every candidate window, then split the flat
        # feature vector into one equal-length descriptor per window
        hog_features = utils.get_hog(resized_img, locations=locations,
                                     winSize=(self.roi_w_len, self.roi_h_len))
        hog_feature_list = list(utils.chunks(hog_features, len(hog_features) // len(locations)))

        predict_results = predict.clf.predict(hog_feature_list)

        predict.captured = False

        # class 0 marks a detected object; only the first match is kept
        for i, label in enumerate(predict_results):
            if label == 0:
                u_h = locations[i][1]
                d_h = u_h + self.roi_h_len
                l_w = locations[i][0]
                r_w = l_w + self.roi_w_len
                result["objects"].append({
                    "positions":
                        {"cross": [[l_w * self.scaling_factor, u_h * self.scaling_factor]
                            , [r_w * self.scaling_factor, u_h * self.scaling_factor]
                            , [r_w * self.scaling_factor, d_h * self.scaling_factor]
                            , [l_w * self.scaling_factor, d_h * self.scaling_factor]]},
                    "attributes":
                        {"status": "Normal"}
                    }
                )
                predict.pre_pos = (l_w, u_h)
                predict.captured = True
                break

        return result
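
The HOG call above returns a single flat feature vector covering every window in `locations`; the per-window feature lists handed to the classifier are simply equal-length slices of it, which is what `utils.chunks` produces here. A dependency-free sketch of that split (it assumes, as the code above does, that the vector length divides evenly by the number of windows):

def split_per_window(flat_features, n_windows):
    # Split a flat descriptor into n_windows equal-length feature lists.
    per_window = len(flat_features) // n_windows
    return [flat_features[i * per_window:(i + 1) * per_window]
            for i in range(n_windows)]

print(split_per_window([1, 2, 3, 4, 5, 6], 3))
# -> [[1, 2], [3, 4], [5, 6]]
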
Example No. 4
    def handle(self, *args, **options):
        start_time = time.time()
        input_file = options['input_file']

        if input_file in (None, '') or not os.path.exists(input_file):
            print("Unable to locate input file. Abort captain. ABORT !")
            return

        print("Importing data from: '{}'".format(input_file))

        files_sig = []

        with open(input_file, 'r') as f:
            data = json.load(f)
            for cve_id in tqdm(data["cves"].keys(), desc="VIA-pre"):

                try:
                    # skip CVEs outside the requested year, if one was given
                    year = options['year']
                    if (year not in [None, '']
                            and re.match(r'.*([1-3][0-9]{3})', year)
                            and year != cve_id.split('-')[1]):
                        continue
                    # sync_exploits_fromvia(cve_id, data["cves"][cve_id])
                    files_sig.append(
                        import_via_task.s(
                            cve_id, data["cves"][cve_id]).set(queue='data'))
                except Exception:
                    pass

        pbar = tqdm(total=len(files_sig), desc="VIA-run")
        for chunk in chunks(files_sig, CHUNK_SIZE):
            res = group(chunk)()
            res.get()
            pbar.update(len(chunk))  # the last chunk may be smaller than CHUNK_SIZE
        pbar.close()

        elapsed_time = time.time() - start_time
        print("Import done! Well done captain.")
        print("Elapsed time (in seconds): %.3f" % elapsed_time)
Example No. 5
def parse_indices(sequence, parameters):
    # map each configured field name to a position within the read sequence
    indices = {}
    for index in parameters:
        if parameters[index]['from_end']:
            indices[index] = len(sequence) - parameters[index]['index']
        else:
            indices[index] = parameters[index]['index']
    return indices

r = []
print("Reading source file into in-memory database of barcodes: {}".format(snakemake.input['fastqfile']))
con = sqlite3.connect(":memory:")
cur = con.cursor()
print("")
cur.execute('CREATE TABLE barcodes (seqid TEXT, celbc TEXT, umi TEXT)')
# reading from file and writing to database in chunks to save memory
fastq_parser = Bank(snakemake.input.fastqfile[0])
for chunk in chunks(fastq_parser, 10000):
    r = []
    for seq in chunk:
        sequence = seq.sequence.decode("utf-8")
        indices = parse_indices(sequence, config['params']['barcoding'])
        umi = sequence[indices["umi_start"]:indices["umi_end"]]
        cel = sequence[indices["cell_bc_start"]:indices["cell_bc_end"]]
        seqid = seq.comment.decode("utf-8").split(" ")[0]
        r.append((seqid, cel, umi))
    cur.executemany('INSERT INTO barcodes VALUES (?,?,?)', r)
r = None
print("Creating index on read indentifiers")
cur.execute('CREATE UNIQUE INDEX seqidx ON barcodes (seqid)')

print("Writing output file: {}".format(snakemake.output))
fi = open(snakemake.input.samfile, 'r') 
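
The `parse_indices` call near the top of this script expects `config['params']['barcoding']` to map each field name used later (umi_start, umi_end, cell_bc_start, cell_bc_end) to an index and a from_end flag. The real values live in the workflow's config file; a hypothetical example of the expected shape, with made-up positions:

barcoding = {
    'cell_bc_start': {'index': 0,  'from_end': False},   # cell barcode starts at read position 0
    'cell_bc_end':   {'index': 12, 'from_end': False},   # ...and ends at position 12
    'umi_start':     {'index': 12, 'from_end': False},   # UMI follows the cell barcode
    'umi_end':       {'index': 8,  'from_end': True},    # counted back from the end of the read
}

# For a 40 bp read, parse_indices(read, barcoding) would return
# {'cell_bc_start': 0, 'cell_bc_end': 12, 'umi_start': 12, 'umi_end': 32}
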
Example No. 6
    def handle(self, *args, **options):
        start_time = time.time()
        input_dir = options['input_dir']
        last_update = None
        allowed_feeds = None

        # Validate options
        if input_dir in (None, '') or not os.path.isdir(input_dir):
            print("Unable to locate directory. Abort captain. ABORT !")
            return

        if options['enumerate_feeds'] is True:
            for feed_dirname in os.listdir(input_dir):
                feed_dirname_full = os.path.join(input_dir, feed_dirname, 'data/exploits')
                if os.path.isdir(feed_dirname_full) and feed_dirname != '__pycache__':
                    print(feed_dirname)
            return

        if options['feeds'] not in [None, '']:
            try:
                allowed_feeds = options['feeds'].split(',')
            except Exception:
                print("Bad feeds format (use commas). Abort captain. ABORT !")
                return

        if options['last_update'] not in [None, '']:
            last_update = ""
            try:
                last_update = datetime.strptime(options['last_update'],
                                                '%Y-%m-%d')
                print("Only last updates from {}".format(last_update))
            except Exception:
                print(
                    "Bad datetime format (Use 'YYYY-MM-DD' instead). Abort captain. ABORT !"
                )
                return

        print("Importing data from: '{}'".format(input_dir))

        # Find exploits
        for feed_dirname in os.listdir(input_dir):
            if allowed_feeds is not None and feed_dirname not in allowed_feeds:
                continue

            print("Checking already submitted files")
            feed_checksums = get_checksums(input_dir, feed_dirname)
            already_imported_files = list(
                DataFeedImport.objects.filter(
                    hash__in=feed_checksums.keys(),
                    has_error=False,
                    type='exploit',
                    source=feed_dirname.lower()).values_list('filename',
                                                             flat=True))

            feed_dirname_full = os.path.join(input_dir, feed_dirname)
            feed_datadir = os.path.join(feed_dirname_full, 'data')
            feed_data_exploits_dir = os.path.join(feed_datadir, 'exploits')
            if os.path.isdir(feed_dirname_full) and os.path.isdir(
                    feed_datadir) and os.path.isdir(feed_data_exploits_dir):
                feed_files = []  # filenames
                feed_files_sig = []  # task signatures
                for year_dir in os.listdir(feed_data_exploits_dir):
                    # skip years other than the requested one, if a year was given
                    year = options['year']
                    if (year not in [None, '']
                            and re.match(r'.*([1-3][0-9]{3})', year)
                            and year != year_dir.split('/')[-1]):
                        continue
                    year_dir_path = os.path.join(feed_data_exploits_dir,
                                                 year_dir)
                    for month_dir in os.listdir(year_dir_path):
                        month_dir_path = os.path.join(year_dir_path, month_dir)
                        for day_dir in os.listdir(month_dir_path):
                            day_dir_path = os.path.join(
                                month_dir_path, day_dir)
                            for exploit_filename in os.listdir(day_dir_path):
                                if exploit_filename not in already_imported_files:
                                    feed_files.append(
                                        os.path.join(day_dir_path,
                                                     exploit_filename))

                for file in tqdm(feed_files,
                                 desc="{}-pre".format(feed_dirname)):
                    try:
                        with open(file, 'r') as f:
                            file_data = json.load(f)
                            file_checked_at = datetime.strptime(
                                file_data['checked_at'].split(' ')[0],
                                '%Y-%m-%d')
                            if last_update in (None, '') or last_update < file_checked_at:
                                # import_exploit(file_data)
                                # import_exploit_task.apply_async(
                                #     args=[file_data],
                                #     queue='data',
                                #     retry=False
                                # )
                                feed_files_sig.append(
                                    import_exploit_task.s(file_data).set(
                                        queue='data'))

                    except Exception as e:
                        print(e)

                pbar = tqdm(total=len(feed_files_sig),
                            desc="{}-run".format(feed_dirname))
                for chunk in chunks(feed_files_sig, CHUNK_SIZE):
                    res = group(chunk)()
                    res.get()
                    pbar.update(len(chunk))  # the last chunk may be smaller than CHUNK_SIZE
                pbar.close()

        elapsed_time = time.time() - start_time
        print("Import done! Well done captain.")
        print("Elapsed time (in seconds): %.3f" % elapsed_time)
Example No. 7
def generate_relationship_edges(change_tracker, relationship_dir,
                                additional_info_dir, db):
    relationship_files = list(relationship_dir.glob('*.json'))

    if not change_tracker.is_any_file_new_or_changed(relationship_files):
        return

    print('Generating Parallels')
    relationship_data = []
    for relationship_file in relationship_files:
        relationship_data.extend(json_load(relationship_file))

    uid_matcher = get_uid_matcher(db)

    remarks_data = json_load(additional_info_dir / 'notes.json')

    remarks = defaultdict(dict)

    for remark in remarks_data:
        uids = remark['relations']
        remark_text = remark['remark']
        remarks[frozenset(uids)] = remark_text

    antispam = set()
    ll_edges = []
    for entry in tqdm(relationship_data):
        entry.pop('remarks', None)
        for r_type, uids in entry.items():
            if r_type == 'retells':
                r_type = 'retelling'
            elif r_type == 'mentions':
                r_type = 'mention'
            elif r_type == 'parallels':
                r_type = 'full'

            if r_type == 'full':
                full = [uid for uid in uids if not uid.startswith('~')]
                partial = [uid for uid in uids if uid.startswith('~')]
                for from_uid in full:
                    m = regex.search('[0-9]+$', from_uid)
                    if m:
                        from_nr = int(m[0])
                    else:
                        from_nr = 0
                    true_from_uids = uid_matcher.get_matching_uids(from_uid)
                    if not true_from_uids and ' ' not in from_uid:
                        logging.error(
                            f'Relationship from uid could not be matched: {from_uid} (dropped)'
                        )
                        continue

                    for to_uids, is_resembling in ((full, False), (partial,
                                                                   True)):
                        for to_uid in to_uids:
                            if to_uid == from_uid:
                                continue
                            true_to_uids = uid_matcher.get_matching_uids(
                                to_uid)
                            if not true_to_uids:
                                logging.info(
                                    f'Relationship to uid could not be matched: {to_uid} (appears as orphan)'
                                )
                                true_to_uids = ['orphan']

                            for true_from_uid in true_from_uids:
                                for true_to_uid in true_to_uids:
                                    remark = remarks.get(
                                        frozenset([true_from_uid,
                                                   true_to_uid]), None)
                                    ll_edges.append({
                                        '_from': true_from_uid,
                                        '_to': true_to_uid,
                                        'from': from_uid,
                                        'number': from_nr,
                                        'to': to_uid.lstrip('~'),
                                        'type': r_type,
                                        'resembling': is_resembling,
                                        'remark': remark,
                                    })
            else:
                first_uid = uids[0]
                m = regex.search('[0-9]+$', first_uid)
                if m:
                    from_nr = int(m[0])
                else:
                    from_nr = 0
                true_first_uids = uid_matcher.get_matching_uids(first_uid)
                for true_first_uid, to_uid in product(true_first_uids, uids[1:]):
                    true_from_uids = uid_matcher.get_matching_uids(to_uid)
                    if not true_from_uids and ' ' not in to_uid:
                        logging.error(
                            f'Relationship to uid could not be matched: {to_uid} (dropped)'
                        )
                        continue
                    for true_from_uid in true_from_uids:
                        remark = remarks.get(
                            frozenset([true_from_uid, true_first_uid]), None)
                        # record the edge in both directions
                        ll_edges.append({
                            '_from': true_first_uid,
                            '_to': true_from_uid,
                            'from': first_uid.lstrip('~'),
                            'to': to_uid,
                            'number': from_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark,
                        })
                        m = regex.search('[0-9]+$', to_uid)
                        if m:
                            to_nr = int(m[0])
                        else:
                            to_nr = 0
                        ll_edges.append({
                            '_from': true_from_uid,
                            '_to': true_first_uid,
                            'from': to_uid,
                            'to': first_uid.lstrip('~'),
                            'number': to_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark,
                        })

    # Because there are many edges (nearly 400k at last count) chunk the import
    db['relationship'].truncate()
    for chunk in chunks(ll_edges, 10000):
        db['relationship'].import_bulk_logged(chunk,
                                              from_prefix='super_nav_details',
                                              to_prefix='super_nav_details')
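
Remarks are stored under frozenset keys so that a note attached to a pair of texts can be found no matter which direction the edge is being built in. A small standalone illustration (the uids and the remark text are made up):

remarks = {}
remarks[frozenset(['dn1', 'da21'])] = 'parallel noted in the introduction'

print(remarks.get(frozenset(['dn1', 'da21'])))   # found
print(remarks.get(frozenset(['da21', 'dn1'])))   # same key, order does not matter
print(remarks.get(frozenset(['dn1', 'mn1'])))    # None: no remark for this pair
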
Example No. 8
def generate_relationship_edges(change_tracker, relationship_dir, additional_info_dir, db):
    relationship_files = list(relationship_dir.glob('*.json'))

    if not change_tracker.is_any_file_new_or_changed(relationship_files):
        return

    print('Generating Parallels')
    relationship_data = []
    for relationship_file in relationship_files:
        relationship_data.extend(json_load(relationship_file))

    uid_matcher = get_uid_matcher(db)

    remarks_data = json_load(additional_info_dir / 'notes.json')

    remarks = defaultdict(dict)

    for remark in remarks_data:
        uids = remark['relations']
        remark_text = remark['remark']
        remarks[frozenset(uids)] = remark_text

    antispam = set()
    ll_edges = []
    for entry in tqdm(relationship_data):
        entry.pop('remarks', None)
        for r_type, uids in entry.items():
            if r_type == 'retells':
                r_type = 'retelling'
            elif r_type == 'mentions':
                r_type = 'mention'
            elif r_type == 'parallels':
                r_type = 'full'

            if r_type == 'full':
                full = [uid for uid in uids if not uid.startswith('~')]
                partial = [uid for uid in uids if uid.startswith('~')]
                for from_uid in full:
                    m = regex.search('[0-9]+$', from_uid)
                    if m:
                        from_nr = int(m[0])
                    else:
                        from_nr = 0
                    true_from_uids = uid_matcher.get_matching_uids(from_uid)
                    if not true_from_uids and ' ' not in from_uid:
                        logging.error(f'Relationship from uid could not be matched: {from_uid} (dropped)')
                        continue

                    for to_uids, is_resembling in ((full, False), (partial, True)):
                        for to_uid in to_uids:
                            if to_uid == from_uid:
                                continue
                            true_to_uids = uid_matcher.get_matching_uids(to_uid)
                            if not true_to_uids:
                                logging.error(f'Relationship to uid could not be matched: {to_uid} (appears as orphan)')
                                true_to_uids = ['orphan']

                            for true_from_uid in true_from_uids:
                                for true_to_uid in true_to_uids:
                                    remark = remarks.get(frozenset([true_from_uid, true_to_uid]),
                                                         None)
                                    ll_edges.append({
                                        '_from': true_from_uid,
                                        '_to': true_to_uid,
                                        'from': from_uid,
                                        'number': from_nr,
                                        'to': to_uid.lstrip('~'),
                                        'type': r_type,
                                        'resembling': is_resembling,
                                        'remark': remark
                                    })
            else:
                first_uid = uids[0]
                m = regex.search('[0-9]+$', first_uid)
                if m:
                    from_nr = int(m[0])
                else:
                    from_nr = 0
                true_first_uids = uid_matcher.get_matching_uids(first_uid)
                for true_first_uid, to_uid in product(true_first_uids, uids[1:]):
                    true_from_uids = uid_matcher.get_matching_uids(to_uid)
                    if not true_from_uids and ' ' not in to_uid:
                        logging.error(f'Relationship to uid could not be matched: {to_uid} (dropped)')
                        continue
                    for true_from_uid in true_from_uids:
                        remark = remarks.get(frozenset([true_from_uid, true_first_uid]), None)
                        ll_edges.append({
                            '_from': true_first_uid,
                            '_to': true_from_uid,
                            'from': first_uid.lstrip('~'),
                            'to': to_uid,
                            'number': from_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark
                        })
                        m = regex.search('[0-9]+$', to_uid)
                        if m:
                            to_nr = int(m[0])
                        else:
                            to_nr = 0
                        ll_edges.append({
                            '_from': true_from_uid,
                            '_to': true_first_uid,
                            'from': to_uid,
                            'to': first_uid.lstrip('~'),
                            'number': to_nr,
                            'type': r_type,
                            'resembling': any(x.startswith('~') for x in [first_uid, to_uid]),
                            'remark': remark
                        })
    
    
    # Because there are many edges (nearly 400k at last count) chunk the import
    db['relationship'].truncate()
    for chunk in chunks(ll_edges, 10000):
        db['relationship'].import_bulk(chunk, from_prefix='root/', to_prefix='root/')
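
Both versions finish the same way: truncate the relationship collection and import the edges in chunks of 10000, letting from_prefix/to_prefix expand the bare uids in _from/_to into full document handles (so 'dn1' becomes 'root/dn1' in this version). A minimal sketch of that final step with python-arango; the connection details and the single example edge are illustrative:

from arango import ArangoClient

client = ArangoClient(hosts='http://localhost:8529')
db = client.db('_system', username='root', password='passwd')

ll_edges = [{'_from': 'dn1', '_to': 'da21', 'type': 'full', 'resembling': False}]

collection = db.collection('relationship')
collection.truncate()
for start in range(0, len(ll_edges), 10000):
    # the prefix is prepended to each bare _from/_to value on import
    collection.import_bulk(ll_edges[start:start + 10000],
                           from_prefix='root/', to_prefix='root/')
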