Ejemplo n.º 1
0
 def _parse_table(self, filehandle, delimiter="\t"):
     """Parse a delimited text file into a dict of column name -> value list.

     The first row becomes self.header; every later row contributes one
     (stripped, strOrNone-normalized) entry per column.
     """
     log(f"Reading {file_str(filehandle)} as tab-delimited file ...")
     raw_rows = [line.split(delimiter) for line in filehandle.readlines()]
     self.header = [cell.strip() for cell in raw_rows[0]]
     body = raw_rows[1:]
     columns = {}
     for idx, name in enumerate(self.header):
         columns[name] = [strOrNone(row[idx].strip()) for row in body]
     return columns
Ejemplo n.º 2
0
def headlessTabularTyping(data, levels=None, na_str=None):
    """Infer column types for headerless tabular data.

    Parameters
    ----------
    data : list of columns, each a list of raw values (all the same length)
    levels : passed through to Phrase
    na_str : values to treat as missing; defaults to [None]

    Returns
    -------
    list of Phrase, one per row (i.e. one per index across the columns).
    """
    # FIX: the old signature used the mutable default `na_str=[None]`, which
    # is shared across calls; use a None sentinel instead.
    if na_str is None:
        na_str = [None]
    cols = []
    for (i, xs) in enumerate(data):
        hl = HomoList(xs, na_str=na_str).data
        # Columns have no names here, so report them positionally as 'Xi'.
        log(f" - 'X{i}':{colors.good(hl[0].typename)}")
        cols.append(hl)
    phrases = [
        Phrase([col[i] for col in cols], levels=levels) for i in range(len(cols[0]))
    ]
    return phrases
Ejemplo n.º 3
0
 def _parse_excel(self, filehandle):
     """Parse an excel file into a dict mapping column name -> list of values.

     Sets self.header to the column names.

     Raises
     ------
     xlrd.biffh.XLRDError
         If the file cannot be read as an excel workbook (logged first).
     """
     # Keep only the line that can raise XLRDError inside the try block.
     try:
         log(f"Reading {file_str(filehandle)} as excel file ...")
         d = pd.read_excel(filehandle.name)
     except xlrd.biffh.XLRDError:
         log(f"Could not parse '{file_str(filehandle)}' as an excel file")
         # bare raise preserves the original traceback
         raise
     self.header = list(d.columns)
     # create a dictionary of List(str) with column names as keys
     # FIX: removed the unreachable trailing `return d` (the try returned
     # and the except re-raised, so it could never execute).
     return {c: [strOrNone(x) for x in d[c]] for c in d}
Ejemplo n.º 4
0
def tabularTyping(data, levels=None, na_str=None):
    """Infer column types for named tabular data.

    Parameters
    ----------
    data : mapping of column name -> list of raw values (equal lengths)
    levels : passed through to Phrase
    na_str : values to treat as missing; defaults to [None]

    Returns
    -------
    list of Phrase, one per row.
    """
    # FIX: the old signature used the mutable default `na_str=[None]`, which
    # is shared across calls; use a None sentinel instead.
    if na_str is None:
        na_str = [None]
    cols = []
    for k, v in data.items():
        hl = HomoList(v, field_name=k, na_str=na_str).data
        if hl:
            log(f" - '{k}':{colors.good(hl[0].typename)}")
        else:
            log(f"{colors.bad('Warning:')} no data")
        cols.append(hl)
    # FIX: guard against empty input — `cols[0]` raised IndexError before.
    if not cols:
        return []
    phrases = [
        Phrase([col[i] for col in cols], levels=levels) for i in range(len(cols[0]))
    ]
    return phrases
Ejemplo n.º 5
0
 def cast(self, data):
     """Classify one raw value, returning the first truthy token produced.

     Empty strings short-circuit to Missing. Each classifier in
     self.classifiers is tried in order; if none accepts the value,
     self.default_classifier is used.
     """
     if data == "":
         return Missing(data, na_str=self.na_str)
     for classifier in self.classifiers:
         try:
             token = classifier(data, field_name=self.field_name, na_str=self.na_str)
         except TypeError as e:
             # FIX: `token` is unbound here (the classifier call itself
             # raised), so the old `log(token)` threw a NameError that
             # masked the real failure. Log the value and the error.
             log(data)
             log(str(e))
             sys.exit(1)
         if token:
             return token
     return self.default_classifier(
         data, field_name=self.field_name, na_str=self.na_str
     )
Ejemplo n.º 6
0
 def log(self):
     """Report the configured classifier tokens and the tag, if any."""
     log("Parsing with the following tokens:")
     for c in self.classifiers:
         log(f"  {colors.good(c.typename)}")
     if not self.tag:
         log(f"{colors.bad('No tag given')}")
     else:
         log(f"Tagging as '{self.tag}'")
Ejemplo n.º 7
0
 def relate(self, tokens, g, levels=None):
     """Add triples connecting this token's URI to the other tokens in g."""
     uri = self.as_uri()
     g.add((uri, P.proseq, Literal(self.clean)))
     has_segment = self._has_segment(tokens)
     for other in tokens:
         # skip tokens with no cleaned value
         if other.clean is None:
             continue
         group = other.group
         if group == "segment":
             g.add((other.as_uri(), P.has_feature, uri))
         elif group == "strain":
             if has_segment:
                 log(
                     "WARNING: I don't know how to connect a protein to a strain id"
                 )
         elif other.choose_field_name() not in STRAIN_FIELDS and not has_segment:
             other.object_of(g, uri)
Ejemplo n.º 8
0
 def cast(self, data):
     """Type the rows in `data`, using column inference when possible."""
     # If all entries have the same number of entries, I treat them as a
     # table. Then I can use column-based type inference.
     row_lengths = {len(xs) for xs in data}
     if len(row_lengths) != 1:
         # Ragged rows: type each row independently.
         return [
             Phrase(
                 [Datum(x, na_str=self.na_str).data for x in row],
                 levels=self.levels,
             )
             for row in data
         ]
     n_fields = len(data[0])
     log(f"Applying column type inference (all headers have {n_fields-1} fields)")
     # Transpose the rows into columns for column-wise inference.
     columns = [[row[j] for row in data] for j in range(n_fields)]
     return headlessTabularTyping(columns, levels=self.levels, na_str=self.na_str)
Ejemplo n.º 9
0
 def _parse_fasta(self, filehandle, sep="|"):
     """
     Parse a fasta file. The header is split into fields on 'sep'. The
     sequence is added as a final field.
     """
     # '>' then everything up to the end of the line is the header
     header_parser = parsec.string(">") >> parsec.regex("[^\n\r]*") << parsec.spaces()
     # sequence lines are joined into a single string
     seq_parser = (
         parsec.sepBy1(
             parsec.regex("[^>\n\r]*"), sep=parsec.regex("[\r\n\t ]+")
         ).parsecmap(concat)
         << parsec.spaces()
     )
     fasta_parser = parsec.many1(header_parser + seq_parser)
     log(f"Reading {file_str(filehandle)} as a fasta file:")
     try:
         entries = fasta_parser.parse(filehandle.read())
     except AttributeError:
         # in case I want to pass in a list of strings, e.g., in tests
         entries = fasta_parser.parse(filehandle)
     return [header.split(sep) + [seq] for (header, seq) in entries]
Ejemplo n.º 10
0
 def summarize(self):
     """Log the type, field name, raw value, and cleaned value of the datum."""
     d = self.data
     log(f"typename: {d.typename}")
     log(f"field_name: {d.field_name}")
     log(f"value: {d.dirty}")
     log(f"munged: {d.clean}")
Ejemplo n.º 11
0
 def connect(self, g):
     """Build triples for every phrase in self.data under a single tag URI."""
     log("Making triples")
     taguri = addTag(g, tag=self.tag, filehandle=self.filehandle)
     # FIX: dropped the unused `enumerate` index — only the phrase is used.
     for phrase in tqdm(self.data):
         phrase.connect(g, taguri=taguri)