def test_run(self):
    """infer header types with NA's in every line

    Every data line carries an NA in some column; the reader must look
    past the NAs to infer column types. The int column holds NA on the
    first line and 12 on the second, so the non-NA values sum to 12.
    """
    n_lines = 0
    header_list = ["str", "float", "int", "str"]
    fname = "tmp_normal.txt"
    # Put an NA in the first line -- test the ability to look past to infer types
    first_lines = [
        randString() + "\t59.3\tNA\t" + randString() + "\n",
        randString() + "\tNA\t12\t" + randString() + "\n"
    ]
    makeFile(fname, None, "sfds", n_lines, '\t', 0.1,
             first_lines=first_lines, last_lines=None)
    inf = open(fname, 'r')
    # Infer the types
    fp = util.DelimitedLineReader(inf, header=False)
    header = fp.getHeader()
    vals = []
    while not fp.atEnd():
        v = fp.next()
        if v[2] is not None:
            vals.append(v[2])
    res = sum(vals) == 12
    # Bug fix: `res` was previously computed but never checked, so this
    # test could not fail. Assert it like the other tests in this suite.
    self.assertTrue(res)
    inf.close()
    os.remove(fname)
def readData(self, orf_dict):
    """Read every row of the delimited evidence file and parse it.

    Opens self.filename (``~`` expanded), streams it through a
    DelimitedLineReader configured for MaxQuant-style headers, and hands
    each row dict to self.parseFields() together with orf_dict.

    NOTE(review): Python 2 only -- uses the `file()` builtin and a
    `print` statement.
    """
    inf = file(os.path.expanduser(self.filename),'r')
    # strip=False preserves field whitespace; maxQuantHeader normalizes
    # the raw MaxQuant column names.
    dlr = util.DelimitedLineReader(inf, strip=False, header_name_processor=util.maxQuantHeader)
    # Read in the data
    max_lines = 1e9  # safety cap so a malformed/huge file cannot loop forever
    line = 0
    while not dlr.atEnd() and line < max_lines:
        line += 1
        flds = dlr.nextDict()
        self.parseFields(flds, orf_dict)
    # Warn (but do not fail) if the cap was hit -- data was truncated.
    if line == max_lines:
        print "# Warning: max_lines ({0}) exceeded in ExperimentEvidence.readData()".format(max_lines)
def __init__(self, infile, attribute_parser=parseAttributesDefault):
    """Wrap *infile* (a path or an open handle) in a tab-delimited reader.

    When *infile* is a string it is treated as a path (``~`` expanded)
    and opened; a missing file raises BioFileError. Otherwise it is used
    as an already-open file object.
    """
    self._infile = infile
    self._attr_parser = attribute_parser
    if isinstance(infile, str):
        path = os.path.expanduser(infile)
        if os.path.isfile(path):
            self._infile = open(path, 'r')
        else:
            raise BioFileError(
                "Cannot find the FASTA file {}.".format(path))
    # Headerless, tab-separated line reader over the underlying handle.
    self._dlr = util.DelimitedLineReader(self._infile, header=False, sep='\t')
def test_run(self):
    """read through"""
    # Generate a normal file and simply consume every line to the end;
    # success is the absence of exceptions.
    num_rows = 100
    cols = ["str", "float", "int", "str"]
    path = "tmp_normal.txt"
    makeFile(path, cols, "sfds", num_rows, '\t', 0.0)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    while not reader.atEnd():
        reader.next()
    handle.close()
    os.remove(path)
def test_run(self):
    """header parsing"""
    # The parsed header must round-trip to exactly the names we wrote.
    num_rows = 100
    expected = ["str", "float", "int", "anotherStr"]
    path = "tmp_normal.txt"
    makeFile(path, expected, "sfds", num_rows, '\t', 0.0)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    res = reader.getHeader() == expected
    handle.close()
    os.remove(path)
    return res
def test_run(self):
    """no headers but call to nextDict"""
    # With header=False the row dicts are keyed by column index.
    num_rows = 10
    path = "tmp_no_header.txt"
    types = "sfds"
    makeFile(path, None, types, num_rows, '\t', 0.0)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle, header=False)
    expected_keys = set(range(len(types)))
    while not reader.atEnd():
        row = reader.nextDict()
        self.assertTrue(set(row.keys()) == expected_keys)
    handle.close()
    os.remove(path)
def test_run(self):
    """header processing: custom header processor"""
    # maxQuantHeader should normalize the raw MaxQuant-style names.
    num_rows = 100
    raw_headers = ["Ratio (H/L)", "float", "int", "This + That"]
    path = "tmp_normal.txt"
    makeFile(path, raw_headers, "ffds", num_rows, '\t', 0.1)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(
        handle, header=True, header_name_processor=util.maxQuantHeader)
    names = reader.getHeader()
    self.assertTrue(names[0] == 'ratio.hl')
    self.assertTrue(names[-1] == 'this.plus.that')
    handle.close()
    os.remove(path)
def test_run(self):
    """comment as last line"""
    num_rows = 10
    cols = ["str", "float", "int", "str"]
    path = "tmp_comment_last.txt"
    makeFile(path, cols, "sfds", num_rows, '\t', 0.0)
    # Append a trailing comment line after the data; reading through it
    # must not raise.
    handle = open(path, 'a')
    handle.write("# comment\n")
    handle.close()
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    while not reader.atEnd():
        reader.nextDict()
    handle.close()
    os.remove(path)
def test_run(self):
    """header processing"""
    # Duplicate header names must be deduplicated with .1/.2/... suffixes,
    # respecting the explicit 'int.1' already present.
    num_rows = 100
    raw_headers = ["int", "int.1", "int", "int"]
    path = "tmp_normal.txt"
    makeFile(path, raw_headers, "ffds", num_rows, '\t', 0.1)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    names = reader.getHeader()
    for (pos, expected) in enumerate(['int', 'int.1', 'int.2', 'int.3']):
        self.assertTrue(names[pos] == expected)
    handle.close()
    os.remove(path)
def test_run(self):
    """lines read"""
    # After consuming the whole file, getNumRead() must equal the number
    # of data lines written.
    num_rows = random.randint(10, 300)
    path = "tmp_normal.txt"
    makeFile(path, None, "sfds", num_rows, '\t', 0.0)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    reader.getHeader()
    while not reader.atEnd():
        reader.next()
    res = reader.getNumRead() == num_rows
    handle.close()
    os.remove(path)
    return res
def test_run(self):
    """skip, off by one"""
    # skip=2 stops one line short of the real header line, so the reader
    # picks up a wrong header and 'three' must NOT be a key in any row.
    fname = "tmp_skip.txt"
    inf = open(fname, 'w')
    inf.write("blah\nblah\nblah\n")
    inf.write("one\ttwo\tthree\n")
    inf.write("a\tb\t3\n")
    inf.write("a\tb\t33\n")
    inf.close()
    with open(fname, 'r') as inf:
        dlr = util.DelimitedLineReader(inf, header=True, skip=2)
        for (ri, flds) in enumerate(dlr.dictentries):
            # Bug fix: the original did try/except KeyError with
            # assertTrue(True) in the handler -- it passed silently even
            # when the key WAS present, so the test could never fail.
            # Assert the key's absence directly.
            self.assertTrue('three' not in flds)
    os.remove(fname)
def test_run(self):
    """set header names"""
    # setHeaderNames() replaces the parsed header; rows must then be
    # keyed by the new names.
    fname = "tmp_setheader.txt"
    out = open(fname, 'w')
    out.write("one\ttwo\tthree\n")
    out.write("a\tb\t3\n")
    out.write("a\tb\t33\n")
    out.close()
    expected_third = {0: 3, 1: 33}
    with open(fname, 'r') as inf:
        dlr = util.DelimitedLineReader(inf, header=True)
        dlr.setHeaderNames(['a', 'b', 'c'])
        for (ri, flds) in enumerate(dlr.dictentries):
            if ri in expected_third:
                self.assertTrue(flds['c'] == expected_third[ri])
    os.remove(fname)
def test_run(self):
    """skip, positive case"""
    # skip=3 lands exactly on the header line; the 'three' column must
    # parse to the integer values written below it.
    fname = "tmp_skip.txt"
    out = open(fname, 'w')
    out.write("blah\nblah\nblah\n")
    out.write("one\ttwo\tthree\n")
    out.write("a\tb\t3\n")
    out.write("a\tb\t33\n")
    out.close()
    expected_third = {0: 3, 1: 33}
    with open(fname, 'r') as inf:
        dlr = util.DelimitedLineReader(inf, header=True, skip=3)
        for (ri, flds) in enumerate(dlr.dictentries):
            if ri in expected_third:
                self.assertTrue(flds['three'] == expected_third[ri])
    os.remove(fname)
def test_run(self):
    """set handler type"""
    # Start with every column typed as string, then retype one column to
    # int via setColumnType and check the parsed value.
    fname = "tmp_types.txt"
    out = open(fname, 'w')
    out.write("one\ttwo\tthree\n")
    out.write(" 1.0 two 3\n")
    out.close()
    inf = open(fname, 'r')
    fp = util.DelimitedLineReader(inf, header=True, field_defs='sss', sep=None)
    fp.setColumnType("three", "d")
    row = fp.nextDict()
    self.assertTrue(row['three'] == 3)
    inf.close()
    os.remove(fname)
def test_run(self):
    """iterator over entries"""
    # The .entries iterator yields positional field lists for each row.
    fname = "tmp_comment_first.txt"
    out = open(fname, 'w')
    out.write("one\ttwo\tthree\n")
    out.write(" a b c\n")
    out.write(" a b c\n")
    out.close()
    inf = open(fname, 'r')
    fp = util.DelimitedLineReader(inf, header=True, sep=None)
    for flds in fp.entries:
        for (pos, expected) in enumerate(['a', 'b', 'c']):
            self.assertTrue(flds[pos] == expected)
    inf.close()
    os.remove(fname)
def test_run(self):
    """infer header types 2"""
    # With no NAs, every value in the int column should parse; their sum
    # must exceed the row count.
    num_rows = 100
    path = "tmp_normal.txt"
    makeFile(path, None, "sfds", num_rows, '\t', 0.0)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    reader.getHeader()
    vals = []
    while not reader.atEnd():
        row = reader.next()
        vals.append(row[2])
    self.assertTrue(sum(vals) > num_rows)
    handle.close()
    os.remove(path)
def test_run(self):
    """comments as first lines"""
    # Leading comment lines must be skipped transparently; reading the
    # whole file without an exception is the success criterion.
    num_rows = 10
    cols = ["str", "float", "int", "str"]
    path = "tmp_comment_first.txt"
    makeFile(path, cols, "sfds", num_rows, '\t', 0.0,
             first_lines="# first line comment\n# second line comment")
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle)  # infer the types
    while not reader.atEnd():
        reader.nextDict()
    self.assertTrue(True)
    handle.close()
    os.remove(path)
def test_run(self):
    """multiple delimiters -- spaces -- with sep=None"""
    # sep=None splits on any run of whitespace, so the tab-delimited
    # header and the space-delimited data rows must line up.
    path = "tmp_comment_first.txt"
    out = open(path, 'w')
    out.write("one\ttwo\tthree\n")
    out.write(" a b c\n")
    out.write(" a b c\n")
    out.close()
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle, header=True, sep=None)
    while not reader.atEnd():
        row = reader.nextDict()
        self.assertTrue(row['one'] == 'a')
        self.assertTrue(row['two'] == 'b')
        self.assertTrue(row['three'] == 'c')
        self.assertFalse(row['three'] == 'b')
    handle.close()
    os.remove(path)
def load(self, evidence_descs, filter_tags, filter_experiments, unique_matches, tracked_modifications, orf_dict):
    """Build ExperimentEvidence objects from descriptors and parse their files.

    evidence_descs is a list of EvidenceDescriptor variables. An
    experiment is kept when it passes both filters: it is named in
    filter_experiments (or that filter is empty/None), AND it shares at
    least one tag with filter_tags (or filter_tags is empty). Each
    evidence file is then read once, and every row is offered to all
    experiments that came from that file.

    Returns the list of ExperimentEvidence objects created.

    NOTE(review): Python 2 only -- uses the `file()` builtin.
    """
    # Read experiments
    # Filter based on tags
    self.experiments = []
    # Map filename -> list of experiments sourced from it, so each file
    # is opened and parsed only once even if several experiments share it.
    evidence_fnames = {}
    tag_set = set(filter_tags)
    for ed in evidence_descs:
        # If no filter specified, or our experiment is in the filter, analyze it.
        if filter_experiments is None or filter_experiments == [] or ed.experiment in filter_experiments:
            shared_tags = list(tag_set.intersection(set(ed.tags)))
            if len(filter_tags) == 0 or len(shared_tags) > 0:
                # This experiment should be included
                exev = ExperimentEvidence(tracked_modifications, unique_matches)
                exev.initFrom(ed)
                self.experiments.append(exev)
                try:
                    evidence_fnames[ed.filename].append(exev)
                except KeyError:
                    evidence_fnames[ed.filename] = [exev]
    for fname in evidence_fnames.keys():
        exp_fname = os.path.expanduser(fname)
        assert os.path.isfile(exp_fname), "# No file found for {}\n".format(exp_fname)
        inf = file(exp_fname,'r')
        # strip=False preserves field whitespace; maxQuantHeader
        # normalizes the raw MaxQuant column names.
        dlr = util.DelimitedLineReader(inf, strip=False, header_name_processor=util.maxQuantHeader)
        #print dlr.headers
        # Read in the data
        max_lines = 1e8  # safety cap so a malformed/huge file cannot loop forever
        line = 0
        while not dlr.atEnd() and line < max_lines:
            line += 1
            flds = dlr.nextDict()
            # Let all the experiments try to parse this line
            relevant_experiments = evidence_fnames[exp_fname]
            for ex in relevant_experiments:
                res = ex.parseFields(flds, orf_dict)
        inf.close()
    return self.experiments
def test_run(self):
    """infer header types with NA's in one full column"""
    # Every line carries NA in the int column, so type inference never
    # sees a concrete value for it; getHeader() must still succeed.
    path = "tmp_normal.txt"
    na_lines = [randString() + "\t59.3\tNA\t" + randString() + "\n"] * 100
    makeFile(path, None, "sfds", 0, '\t', 0.1,
             first_lines=na_lines, last_lines=None)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(handle, header=False)
    reader.getHeader()
    handle.close()
    os.remove(path)
def test_run(self):
    """adaptive handler updating"""
    # The last line breaks the 'int' column with a non-numeric value;
    # the reader must adapt its field handlers rather than raise.
    num_rows = 100
    cols = ["float", "float", "int", "str"]
    path = "tmp_normal.txt"
    tail = ["0.2\t0.3\tnot.an.int\twakka\n"]
    makeFile(path, cols, "ffds", num_rows, '\t', 0.01, last_lines=tail)
    handle = open(path, 'r')
    reader = util.DelimitedLineReader(
        handle, header=True, header_name_processor=util.maxQuantHeader)
    reader.getHeader()
    while not reader.atEnd():
        reader.next()
    handle.close()
    os.remove(path)
# Echo every command-line option into the data output as '#'-prefixed
# header comments, for provenance.
optdict = vars(options)
for (k,v) in optdict.items():
    data_outs.write("#\t{k}: {v}\n".format(k=k, v=v))

# Read densities
# Density files should be specified with a regexp pattern matching only the length of the window.
assert '*' in options.in_distribution_fname_pattern
# winsize -> list of cumulative densities / entropy bin upper bounds,
# one pair of lists per window size.
# NOTE(review): range(min, max, 1) excludes max_window_size itself --
# confirm whether the upper bound is meant to be inclusive.
entropies = {}
densities = {}
for winsize in range(options.min_window_size, options.max_window_size,1):
    # Substitute the window size for '*' in the filename pattern.
    fname = options.in_distribution_fname_pattern.replace('*', '{:d}'.format(winsize))
    #if not os.path.isfile(fname):
    #	raise IOError("# Error: file {} does not exist".format(fname))
    try:
        # NOTE(review): `file()` is the Python 2 builtin.
        inf = file(fname, 'r')
        dlr = util.DelimitedLineReader(inf, header=True)
        dens = []
        ent = []
        for flds in dlr.dictentries:
            bin_upper = flds['ent.upper']
            p_entropy_lower = flds['cum.density']
            ent.append(bin_upper)
            dens.append(p_entropy_lower)
        densities[winsize] = dens
        entropies[winsize] = ent
    except IOError:
        # Missing file for this window size: note it and keep going.
        info_outs.write("# File {} not found\n".format(fname))
        continue
    except util.ReaderEOFError:
        # Truncated/empty file: note it and keep going.
        info_outs.write("# EOF encountered in file {}\n".format(fname))
        continue
# Select which database keys to work on.
# Bug fix: the original guards were `if not options.query_orf is []:`,
# which is ALWAYS true -- `is` compares identity and a literal `[]` is a
# fresh object, so it never matches. That made the guards dead and let a
# None option crash on `+= None`. Test truthiness instead, which skips
# both None and the empty list.
if options.query_orf:
    # Specific ORF(s)
    query_keys += options.query_orf
if options.query_gene:
    # Specific gene(s)
    query_keys += [gene_orf_dict[k] for k in options.query_gene]
if len(query_keys) == 0:
    # Go through all proteins in database
    query_keys = all_keys

# Load domain/color boundaries
if not os.path.isfile(options.in_domain_fname):
    raise IOError("# Error: file {} does not exist".format(
        options.in_domain_fname))
dlr = util.DelimitedLineReader(file(options.in_domain_fname, 'r'), header=True)
domain_boundaries = util.listdict()
domain_colors = util.listdict()
for flds in dlr.dictentries:
    orf = flds['orf']
    score = flds['p.value']
    # Keep only domains that pass the significance threshold.
    if score < options.score_threshold:
        domain_boundaries[orf].append((flds['start'], flds['end']))
        domain_colors[orf].append(flds['color'].replace("0x", "#"))

# Remove gaps?
if options.degap:
    for k in query_keys:
        prot_dict[k] = prot_dict[k].replace("-", '')