def test_coerce_numeric_values(self):
    cm = Annotations(
        "../tests/data/metadata_example.txt",
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    cm.create_data_frame()
    cm.file = Annotations.coerce_numeric_values(cm.file, cm.annot_types)
    dtype = cm.file.dtypes[("Average Intensity", "numeric")]
    self.assertEqual(dtype, np.float64)

    # Test that numeric values were rounded to three or fewer decimal places.
    # Pick a random row (the test file has 20 data rows).
    ran_num = random.randint(1, 20)
    for column in cm.file.columns:
        annot_type = column[1]
        if annot_type == "numeric":
            value = str(cm.file[column][ran_num])
            print(Decimal(value).as_tuple().exponent)
            assert (
                abs(Decimal(value).as_tuple().exponent) <= self.EXPONENT
            ), "Numbers did not round to 3 or fewer decimal places"

    # A string value in a numeric column should raise a ValueError.
    cm_has_bad_value = Annotations(
        "../tests/data/metadata_bad_contains_str_in_numeric_column.txt",
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    cm_has_bad_value.create_data_frame()
    self.assertRaises(
        ValueError,
        Annotations.coerce_numeric_values,
        cm_has_bad_value.file,
        cm_has_bad_value.annot_types,
    )
def apply(binary_path: str, sig_path: str) -> Tuple[int, str]:
    """
    Applies signatures in the specified file to the specified binary, and
    writes the resulting bndb to disk.
    :param binary_path: path of binary to apply signatures to
    :param sig_path: path of signature file to read in
    :return: tuple (int count of function signatures matched, str path to BNDB with tags that was created)
    """
    bv = binja.BinaryViewType.get_view_of_file(binary_path)
    print("Loaded binary {} into Binary Ninja.".format(binary_path))
    functions = hash_all(bv)
    print("{} functions in binary have been hashed.".format(len(functions)))

    data = read_json(sig_path)
    signatures = {}
    for raw_hash in data:
        # Only bother with functions that actually have tags.
        if len(data[raw_hash]) > 0:
            signatures[raw_hash] = Annotations(raw_data=data[raw_hash])
    print("Signature file {} loaded into memory.".format(sig_path))

    num_func_sigs_applied = 0
    for function_hash in functions:
        if function_hash in signatures:
            tag_function(bv, functions[function_hash], function_hash, signatures)
            print("Located a match at {}!".format(function_hash))
            num_func_sigs_applied += 1

    output_bndb = os.path.join(os.getcwd(), binary_path + ".bndb")
    print("Writing output Binary Ninja database at {}".format(output_bndb))
    bv.create_database(output_bndb)
    return num_func_sigs_applied, output_bndb
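# A minimal usage sketch for apply(), not from the original source: it assumes
# Binary Ninja is available and uses hypothetical paths "example.bin" and
# "example_sigs.json".
if __name__ == "__main__":
    matched_count, bndb_path = apply("example.bin", "example_sigs.json")
    print("{} signatures matched; database written to {}".format(matched_count, bndb_path))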
def main():
    r"""
    Runs trcls.
    """
    parser = cli.get_parser()
    args = parser.parse_args()
    logger = setup_logging(args)

    if args.version:
        print('trcls {}'.format(VERSION))
        exit(0)

    if args.alignment is None or args.features is None:
        logger.error('Both SAM alignment and GTF annotation files must be provided')
        parser.print_help()
        exit(1)

    with open(args.features) as features_file:
        annotations = Annotations(features_file)

    with open(args.alignment) as alignment_file:
        alignments = alignment_file.readlines()

    headers = filter(lambda l: l.startswith('@'), alignments)
    headers = map(str.strip, headers)
    alignments = filter(lambda l: not l.startswith('@'), alignments)
    transcripts = get_transcripts(alignments, args.skip_tolerance, args.map_tolerance)

    print('\n'.join(headers))
    for transcript in transcripts:
        transcript.annotate(annotations, args.junction_tolerance)
        print(transcript)
def test_validate_numeric_annots(self):
    cluster = Annotations(
        "../tests/data/cluster_bad_missing_coordinate.txt",
        TestAnnotations.ALLOWED_FILE_TYPES,
    )
    cluster.create_data_frame()
    # Call the validator; asserting on the unbound method is always truthy.
    self.assertTrue(cluster.validate_numeric_annots())
def test_merge_df(self):
    cluster = Clusters(
        "../tests/data/test_1k_cluster_data.csv",
        "dec0dedfeed1111111111111",
        "addedfeed000000000000000",
        "testCluster",
    )
    cell_metadata_df = Annotations(
        self.CELL_METADATA_PATH,
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    cell_metadata_df.preprocess()
    cell_names_cell_metadata_df = np.asarray(cell_metadata_df.file["NAME"])
    cell_names_cluster_df = np.asarray(cluster.file["NAME"])
    # Cell names found in both cluster and metadata files
    common_cell_names = cell_names_cluster_df[
        np.isin(cell_names_cluster_df, cell_names_cell_metadata_df)
    ]
    print(f"common cell names: {common_cell_names}")

    # Perform merge
    print(cluster.file[["NAME", "x", "y", "z"]])
    cluster.merge_df(cluster.file[["NAME", "x", "y", "z"]], cell_metadata_df.file)

    # Ensure ONLY cell names common to the cluster and cell metadata files
    # are in the newly merged df
    result = all(
        cell[0] in common_cell_names for cell in cluster.file["NAME"].values
    )
    self.assertTrue(
        result,
        "Merge was not performed correctly. Merge should be performed on 'NAME'",
    )
def test_low_mem_artifact(self):
    # The pandas default of low_memory=True allows internal chunking during
    # parsing, causing inconsistent dtype coercion artifacts for larger
    # annotation files.
    lmtest = Annotations(
        "../tests/data/low_mem_unit.txt",
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    lmtest.preprocess()
    # With low_memory=True, the first row in the file would land in the first
    # chunk, and its numeric value was not properly coerced to a string.
    assert isinstance(
        lmtest.file["mixed_data"]["group"][0], str
    ), "numeric value should be coerced to string"
    # Per SCP-2545, NA values become strings for group annotations.
    print(lmtest.file["mixed_data"]["group"][2])
    print(type(lmtest.file["mixed_data"]["group"][2]))
    assert isinstance(
        lmtest.file["mixed_data"]["group"][2], str
    ), "expect empty cell converted to NaN to be a string for group annotations"
    # A numeric value in the second chunk should still be coerced to string type.
    assert isinstance(
        lmtest.file["mixed_data"]["group"][32800], str
    ), "numeric value should be coerced to string"
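# A pandas-only sketch of the artifact described above, not from the original
# source; it assumes the unit file is tab-delimited. With low_memory=True,
# dtype inference can differ between internal chunks, so reading with
# low_memory=False (as preprocess() evidently does) keeps coercion consistent
# across the whole file.
import pandas as pd

consistent = pd.read_csv("../tests/data/low_mem_unit.txt", sep="\t", low_memory=False)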
def __init__(self, image_dir, gt_path, seqname=None, trackers=None):
    if seqname is None:
        # Assumed data format is /path/to/data/seqname.{csv|txt}
        self.seqname = gt_path.split('/')[-1][:-4]
    else:
        self.seqname = seqname
    self.gt_annotations = Annotations(gt_path, seqname=seqname)
    self.tracker_res = {}
    for i in trackers or []:
        try:
            self.tracker_res[i.name] = i.get_res_of(seqname)
        except Exception:
            print(self.seqname, 'not available for', i.name)
    self.img_dir = image_dir
    self.images = [
        i for i in os.listdir(image_dir) if i.endswith(('.png', '.jpg'))
    ]
    self.images.sort()
    height, width, layers = cv2.imread(
        os.path.join(image_dir, self.images[0])).shape
    self.height = height
    self.width = width
    self.size = (width, height)
    self.obj_size = self.gt_annotations.obj_size
def __init__(self, cluster_file, cell_metadata_file=None):
    Annotations.__init__(self, cluster_file, self.ALLOWED_FILE_TYPES)
    self.preprocess()
    self.determine_coordinates_and_cell_names()
    if cell_metadata_file is not None:
        self.cell_metadata = Annotations(
            cell_metadata_file, CellMetadata.ALLOWED_FILE_TYPES
        )
def test_leading_zeros(self):
    """Ensures leading zeros are not stripped from group annotations"""
    path = "../tests/data/metadata_convention_with_leading_0s.tsv"
    annotation = Annotations(
        path, ["text/csv", "text/plain", "text/tab-separated-values"]
    )
    annotation.preprocess()
    # Grab a value from the donor_id column.
    value_with_leading_zeros = annotation.file.iloc[
        :, annotation.file.columns.get_level_values(0) == "donor_id"
    ].values.item(0)
    self.assertTrue(value_with_leading_zeros.startswith("0"))
def test_duplicate_headers(self):
    """Annotation headers should not contain duplicate values"""
    dup_headers = Annotations(
        "../tests/data/dup_headers_v2.0.0.tsv",
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    self.assertFalse(
        dup_headers.validate_unique_header(),
        "Duplicate headers should fail format validation",
    )
    with self.assertRaises(ValueError):
        dup_headers.preprocess()
def read_tags(bv: Binary_View, hashes: Dict[str, Function]) -> Dict[str, Annotations]:
    """
    Gathers tag locations from every function in the binary.
    :param bv: BinaryView that contains the analysis results
    :param hashes: a dictionary mapping hashes to their functions
    :return: dictionary representing all tags in the current binary
    """
    tagged_dict = {}
    # TODO: switch to GetAllTagReferences once it's available in the Python API
    # for O(1) access times
    for hash_value, function in hashes.items():
        tagged_dict[hash_value] = Annotations(function=function, bv=bv)
    return tagged_dict
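# A companion sketch, not from the original source: it reuses hash_all() from
# the apply() snippet above to build the hash-to-function map read_tags()
# expects; "tagged_example.bndb" is a hypothetical path.
def dump_tag_counts(path: str = "tagged_example.bndb") -> None:
    bv = binja.BinaryViewType.get_view_of_file(path)
    tags_by_hash = read_tags(bv, hash_all(bv))
    print("Collected annotations for {} hashed functions.".format(len(tags_by_hash)))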
def test_convert_header_to_multiIndex(self):
    expected = [
        ("Name", "TYPE"),
        ("X", "numeric"),
        ("Y", "numeric"),
        ("Z", "numeric"),
        ("Average Intensity", "numeric"),
    ]
    path = "../tests/data/good_subsample_cluster.csv"
    annotation = Annotations(
        path, ["text/csv", "text/plain", "text/tab-separated-values"]
    )
    df = annotation.open_file(
        path, open_as="dataframe", skiprows=2, names=annotation.headers
    )[0]
    new_df = Annotations.convert_header_to_multi_index(df, expected)
    # Remove white spaces
    new_df_columns = [tuple(s.strip() for s in y) for y in new_df.columns]
    self.assertEqual(new_df_columns, expected)
def tosling(self, filename):
    documents = []
    annotations = Annotations(self)
    input_stats = self.summary.input

    # Callback that will be invoked for each SLING document that is built.
    # This could be for each sentence or each document part, as specified.
    def callback(document):
        documents.append(document)

    with open(filename, "r") as f:
        input_stats.files.increment()
        lines = f.readlines()
        for line in lines:
            annotations.read(line, callback)

    for document in documents:
        self._add_output_statistics(document)
    return documents
def test_header_format(self):
    """Header rows of metadata file should conform to standard"""
    error_headers = Annotations(
        "../tests/data/error_headers_v2.0.0.tsv",
        ["text/csv", "text/plain", "text/tab-separated-values"],
    )
    self.assertFalse(
        error_headers.validate_header_keyword(),
        "Missing NAME keyword should fail format validation",
    )
    self.assertFalse(
        error_headers.validate_type_keyword(),
        "Missing TYPE keyword should fail format validation",
    )
    self.assertFalse(
        error_headers.validate_type_annotations(),
        "Invalid type annotations should fail format validation",
    )
def __init__(self, output_dir):
    self.output_dir = Path(output_dir)
    self.annotations = Annotations("annotations-bitcoin-0.18.json")
def clean_annotations(self, annotations_file):
    annotations = Annotations(annotations_file)
    annotations.clean_annotations()
def import_see_also(self, markdown_dir, annotations_file):
    annotations = Annotations(annotations_file)
    annotations.import_see_also(markdown_dir)
def setUp(self):
    self.df = Annotations(
        self.CLUSTER_PATH, ["text/csv", "text/plain", "text/tab-separated-values"]
    )
def flna_annotations():
    with open('test/FLNA.gtf') as gtf_file:
        return Annotations(gtf_file)
output_file = args[4]

# Merge the static JSON fragments into one output object.
output_json = {}
with open(json_dir + "/info.json", 'r') as f:
    info_json = json.load(f)
    output_json.update(info_json)
with open(json_dir + "/licenses.json", 'r') as f:
    licenses_json = json.load(f)
    output_json.update(licenses_json)
with open(json_dir + "/categories.json", 'r') as f:
    categories_json = json.load(f)
    output_json.update(categories_json)

images = ArcImages(images_dir)
images_obj = images.get_obj()
# Round-trip through JSON to normalize the object into plain dicts and lists.
images_str = json.dumps(images_obj)
images_json = json.loads(images_str)
output_json.update(images_json)

annos = Annotations(bboxes_dir)
annos_obj = annos.get_obj()
expand_json(images_obj, annos_obj)
annos_str = json.dumps(annos.get_json())
annos_json = json.loads(annos_str)
output_json.update(annos_json)

with open(output_file, 'w') as f:
    json.dump(output_json, f)
def parse_and_generate(filename, out_filename=None, init_filename=None,
                       include_paths=[], defines=[]):
    """
    Parse the file at filename.
    If out_filename and init_filename are None, return a tuple containing the
    generated source code for each; otherwise write the generated files and
    return a tuple of their names.
    """
    from os import path

    if out_filename:
        out_filename = re.sub(FILENAME_EXPR, out_filename, filename)
    if init_filename:
        init_filename = re.sub(FILENAME_EXPR, init_filename, filename)

    rel_filename = ''
    if out_filename is None and init_filename is None:
        rel_filename = re.sub(FILENAME_EXPR, r'\g<basename>.h',
                              path.basename(filename))
    else:
        init_dir = path.dirname(init_filename)
        rel_filename = path.relpath(out_filename, init_dir)

    ast, text = parse_jstruct(filename, include_paths=include_paths, defines=defines)
    annotations = Annotations(text)
    try:
        annotations.expand(ast, '<stdin>')
    except ExpansionError as ex:
        ex.filename = filename
        raise

    prune_ast(ast, '<stdin>')
    out_ast, init_ast = split_ast(ast)

    generator = CGenerator()
    out_result = generator.visit(out_ast)
    init_result = generator.visit(init_ast)

    if GUARD_HEADERS_EXPR.search(out_result):
        out_result = re.sub(GUARD_HEADERS_EXPR, r'\g<0>' + GENERATED,
                            out_result, count=1) + '\n#endif\n'
    else:
        out_result = GENERATED + out_result

    init_result = re.sub(GUARD_HEADERS_EXPR, '', init_result)
    init_instructions = INIT_INSTRUCTIONS if init_filename and init_filename.endswith('.h') else ''
    init_result = GENERATED1NL + init_instructions + INCLUDE_H(rel_filename) + init_result

    if out_filename:
        with open(out_filename, 'w') as out_file:
            out_file.write(out_result)
    if init_filename:
        with open(init_filename, 'w') as init_file:
            init_file.write(init_result)

    if out_filename is None and init_filename is None:
        return (out_result, init_result)
    else:
        return (out_filename, init_filename)
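# A minimal usage sketch for parse_and_generate(), not from the original
# source; 'point.jstruct' is a hypothetical input file. With no output names
# given, the generated header and init source are returned as strings rather
# than written to disk.
header_src, init_src = parse_and_generate('point.jstruct')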
def show_missing(self, cli, annotations_file):
    commands = HelpParser().parse_help_overview(CliCaller(cli).help()).flat()
    annotations = Annotations(annotations_file)
    annotations.show_missing(commands)
def mark_added(self, annotations_file, version, command):
    annotations = Annotations(annotations_file)
    annotations.mark_added(version, command)
def generate_lawbook_gatsby(name):
    ANNOTATIONS = ANNOTATIONS_MAP.get(name, Annotations([]))
    with open(os.path.join(STATIC_DIR, "%s.js" % name), "w+", encoding="utf-8") as fp:
        fp.write("""
import React from "react"
import Norm from "../components/norm"
import Abs from "../components/abs"
import Sub from "../components/sub"
import Section from "../components/section"

export default () => (
<div>
""")
        data = read_json(name)
        # How deep we currently are, e.g. ["Buch", "Abschnitt", "Titel"]
        section_types = []
        fp.write("<h1>%s</h1>" % name)
        for entry in data:
            if entry["type"] == "section":
                title = entry["title"]
                section_type = title.split(" ")[0]
                idx = find(section_type, section_types)
                if idx == -1:
                    section_types.append(section_type)
                else:
                    fp.write("</Section>" * (len(section_types) - idx))
                    section_types = section_types[:idx + 1]
                fp.write("<Section title={'%s'}>" % title)
            else:
                paragraph, title = entry["norm"], entry.get("title", "")
                if title is None:
                    title = ""
                # print("Writing %s %s" % (paragraph, name))
                fp.write(
                    "<Norm norm={'%s'} title={'%s'} marked={%s}>\n"
                    % (paragraph, title,
                       "true" if ANNOTATIONS.is_marked(paragraph) else "false"))
                for absatz in entry["paragraphs"]:
                    fp.write("<Abs> %s\n" % absatz["text"])
                    subs = absatz["sub"]
                    if subs:
                        for i, sub in enumerate(subs):
                            fp.write("<Sub>%d. %s\n" % (i + 1, sub["text"]))
                            subsubs = sub["sub"]
                            if subsubs:
                                fp.write("<div class='subsubbox'>\n")
                                letters = lit_gen()
                                for subsub in subsubs:
                                    fp.write(
                                        "<div class='subsub'>%s) %s</div>\n"
                                        % (next(letters), subsub["text"]))
                                fp.write("</div>\n")  # .subsubbox
                            fp.write("</Sub>\n")
                    fp.write("</Abs>\n")
                fp.write("</Norm>\n")
        if section_types:
            print(section_types)
            fp.write("</Section>" * len(section_types))
        fp.write("</div>)")  # end global div
def generate_lawbook(name):
    ANNOTATIONS = ANNOTATIONS_MAP.get(name, Annotations([]))
    with open(os.path.join(STATIC_DIR, "%s.html" % name), "w+", encoding="utf-8") as fp:
        fp.write("""<html>
<head>
<title> %s </title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script>
<link href="css/gesetze.css" rel="stylesheet" title="Default Style">
</head>
<body>
""" % name)
        data = read_json(name)
        fp.write("<h1>%s</h1>" % name)
        for entry in data:
            if entry["type"] == "section":
                fp.write("<h3>%s</h3>" % entry["title"])
            else:
                paragraph, title = entry["norm"], entry.get("title", "")
                if title is None:
                    title = ""
                # print("Writing %s %s" % (paragraph, name))
                anchor = "<a id='#%s'></a>" % entry["norm"]
                fp.write("<div class='norm'>")
                fp.write(
                    "<div class='normhead%s'>%s %s</div> %s"
                    % (" marked" if ANNOTATIONS.is_marked(paragraph) else "",
                       paragraph, title, anchor))
                fp.write("<div class='normtext'>")
                for absatz in entry["paragraphs"]:
                    fp.write("<div class='abs'>%s" % absatz["text"])
                    subs = absatz["sub"]
                    if subs:
                        fp.write("<div class='subbox'>")
                        for i, sub in enumerate(subs):
                            fp.write("<div class='sub'>%d. %s" % (i + 1, sub["text"]))
                            subsubs = sub["sub"]
                            if subsubs:
                                fp.write("<div class='subsubbox'>")
                                letters = lit_gen()
                                for subsub in subsubs:
                                    fp.write(
                                        "<div class='subsub'>%s) %s</div>"
                                        % (next(letters), subsub["text"]))
                                fp.write("</div>")  # .subsubbox
                            fp.write("</div>")  # .sub
                        fp.write("</div>")  # .subbox
                    fp.write("</div>")  # .abs
                fp.write("</div>")  # .normtext
                fp.write("</div>")  # .norm
        fp.write("</body> </html>")