def _parse_table(self, filehandle, delimiter="\t"):
    """Parse a delimited text file into a column-oriented dictionary.

    The first row becomes ``self.header`` (fields stripped of whitespace);
    the remaining rows become columns keyed by header name, with each cell
    passed through ``strOrNone``.
    """
    log(f"Reading {file_str(filehandle)} as tab-delimited file ...")
    lines = [line.split(delimiter) for line in filehandle.readlines()]
    self.header = [field.strip() for field in lines[0]]
    body = lines[1:]
    return {
        name: [strOrNone(row[col].strip()) for row in body]
        for (col, name) in enumerate(self.header)
    }
def headlessTabularTyping(data, levels=None, na_str=None):
    """Infer a type for each column of an unnamed (header-less) table.

    Parameters
    ----------
    data : list of columns, each a list of raw string values
    levels : passed through to ``Phrase``
    na_str : values to treat as missing; defaults to ``[None]``

    Returns a list of ``Phrase`` objects, one per row.
    """
    # BUG FIX: the default was the mutable literal `[None]`, shared across
    # calls; use a None sentinel instead (same effective default).
    if na_str is None:
        na_str = [None]
    cols = []
    for (i, xs) in enumerate(data):
        hl = HomoList(xs, na_str=na_str).data
        # Guard against an empty column before peeking at hl[0], matching
        # the behavior of the sibling `tabularTyping`.
        if len(hl) > 0:
            log(f" - 'X{i}':{colors.good(hl[0].typename)}")
        else:
            log(f"{colors.bad('Warning:')} no data")
        cols.append(hl)
    phrases = [
        Phrase([col[i] for col in cols], levels=levels) for i in range(len(cols[0]))
    ]
    return phrases
def _parse_excel(self, filehandle):
    """Parse an excel file into a column-oriented dictionary.

    Sets ``self.header`` to the sheet's column names and returns a dict of
    column name -> list of ``strOrNone``-cleaned values.

    Raises ``xlrd.biffh.XLRDError`` (after logging) if the file cannot be
    read as excel.
    """
    log(f"Reading {file_str(filehandle)} as excel file ...")
    # Keep the try body minimal: only the read can raise XLRDError.
    try:
        d = pd.read_excel(filehandle.name)
    except xlrd.biffh.XLRDError:
        log(f"Could not parse '{file_str(filehandle)}' as an excel file")
        raise
    self.header = list(d.columns)
    # create a dictionary of List(str) with column names as keys
    return {c: [strOrNone(x) for x in d[c]] for c in d}
    # BUG FIX: the original ended with an unreachable `return d` after the
    # re-raise; it has been removed.
def tabularTyping(data, levels=None, na_str=None):
    """Infer a type for each named column of a table.

    Parameters
    ----------
    data : dict mapping column name -> list of raw string values
    levels : passed through to ``Phrase``
    na_str : values to treat as missing; defaults to ``[None]``

    Returns a list of ``Phrase`` objects, one per row.
    """
    # BUG FIX: the default was the mutable literal `[None]`, shared across
    # calls; use a None sentinel instead (same effective default).
    if na_str is None:
        na_str = [None]
    cols = []
    for k, v in data.items():
        hl = HomoList(v, field_name=k, na_str=na_str).data
        if len(hl) > 0:
            log(f" - '{k}':{colors.good(hl[0].typename)}")
        else:
            log(f"{colors.bad('Warning:')} no data")
        cols.append(hl)
    phrases = [
        Phrase([col[i] for col in cols], levels=levels) for i in range(len(cols[0]))
    ]
    return phrases
def cast(self, data):
    """Convert a raw string into a typed token.

    Empty strings become ``Missing``. Otherwise each classifier is tried in
    order and the first truthy token wins; if none match, the default
    classifier is used.

    A ``TypeError`` from a classifier is treated as fatal: the offending
    value and error are logged and the process exits.
    """
    if data == "":
        return Missing(data, na_str=self.na_str)
    for classifier in self.classifiers:
        try:
            token = classifier(data, field_name=self.field_name, na_str=self.na_str)
        except TypeError as e:
            # BUG FIX: the original logged `token` here, which is unbound
            # when the *first* classifier raises (NameError masking the real
            # error). Log the input and the exception instead.
            log(data)
            log(str(e))
            sys.exit(1)
        if token:
            return token
    return self.default_classifier(
        data, field_name=self.field_name, na_str=self.na_str
    )
def log(self):
    """Report the configured classifier tokens and the tag, if any."""
    # NOTE: the bare `log(...)` below resolves to the module-level logger,
    # not this method.
    log("Parsing with the following tokens:")
    for c in self.classifiers:
        log(f"  {colors.good(c.typename)}")
    tag_msg = f"Tagging as '{self.tag}'" if self.tag else f"{colors.bad('No tag given')}"
    log(tag_msg)
def relate(self, tokens, g, levels=None):
    """Add triples to graph ``g`` linking this (sequence) token to the other
    tokens parsed from the same record.

    Adds this token's cleaned sequence under ``P.proseq``, then for each
    sibling token: segment tokens gain a ``P.has_feature`` link to this URI;
    strain tokens trigger only a warning when a segment is also present;
    other tokens whose field name is outside ``STRAIN_FIELDS`` are attached
    as objects of this URI — but only when no segment token exists.
    """
    uri = self.as_uri()
    g.add((uri, P.proseq, Literal(self.clean)))
    # True if any sibling token is a segment — presumably used to decide
    # whether links should go through the segment instead (TODO confirm).
    has_segment = self._has_segment(tokens)
    for other in tokens:
        # skip tokens whose value did not survive cleaning
        if other.clean is None:
            continue
        if other.group == "segment":
            g.add((other.as_uri(), P.has_feature, uri))
        elif other.group == "strain":
            if has_segment:
                log("WARNING: I don't know how to connect a protein to a strain id")
            # NOTE(review): a strain token with no segment present falls
            # through silently here — verify this is intentional.
        elif not other.choose_field_name() in STRAIN_FIELDS and not has_segment:
            other.object_of(g, uri)
def cast(self, data):
    """Type the parsed rows.

    When every row has the same number of fields the rows form a table, so
    the data is transposed and typed column-wise (column-based inference);
    otherwise each row is typed independently as a ``Phrase`` of ``Datum``.
    """
    row_widths = {len(xs) for xs in data}
    if len(row_widths) == 1:
        N = len(data[0])
        log(f"Applying column type inference (all headers have {N-1} fields)")
        # transpose rows -> columns for column-wise inference
        columns = [[row[i] for row in data] for i in range(N)]
        return headlessTabularTyping(columns, levels=self.levels, na_str=self.na_str)
    return [
        Phrase([Datum(x, na_str=self.na_str).data for x in row], levels=self.levels)
        for row in data
    ]
def _parse_fasta(self, filehandle, sep="|"):
    """
    Parse a fasta file. The header is split into fields on 'sep'. The
    sequence is added as a final field.
    """
    # parser pieces: a '>' header line, then sequence lines joined together
    header = parsec.string(">") >> parsec.regex("[^\n\r]*") << parsec.spaces()
    seq_lines = parsec.sepBy1(
        parsec.regex("[^>\n\r]*"), sep=parsec.regex("[\r\n\t ]+")
    )
    sequence = seq_lines.parsecmap(concat) << parsec.spaces()
    fasta = parsec.many1(header + sequence)
    log(f"Reading {file_str(filehandle)} as a fasta file:")
    try:
        entries = fasta.parse(filehandle.read())
    except AttributeError:
        # in case I want to pass in a list of strings, e.g., in tests
        entries = fasta.parse(filehandle)
    return [h.split(sep) + [q] for (h, q) in entries]
def summarize(self):
    """Log the wrapped token's type, field name, raw value, and cleaned value."""
    d = self.data
    for line in (
        f"typename: {d.typename}",
        f"field_name: {d.field_name}",
        f"value: {d.dirty}",
        f"munged: {d.clean}",
    ):
        log(line)
def connect(self, g):
    """Add triples for every parsed phrase to graph ``g``.

    Creates (or looks up) a tag URI for this file and connects each phrase
    to it, with a tqdm progress bar over the phrases.
    """
    log("Making triples")
    taguri = addTag(g, tag=self.tag, filehandle=self.filehandle)
    # FIX: dropped an unused `enumerate` index from the original loop.
    for phrase in tqdm(self.data):
        phrase.connect(g, taguri=taguri)