Esempio n. 1
0
def iterate_chunks(fw):
    """Yield header lines and sequence fragments read from *fw*.

    The input is consumed through ``fw.read(2**20)`` until an empty result
    is returned.  Every match of the header pattern (a ``>`` followed by
    non-``>`` characters up to the first newline) is yielded stripped of
    surrounding whitespace.  The text between headers is yielded after
    being passed through ``remove_whitespaces``.  A sequence may be
    delivered in several consecutive fragments; empty fragments are not
    yielded.

    Args:
        fw: object exposing ``read(size)``.  Assumed to return ``str``,
            since the header pattern is a ``str`` pattern -- TODO confirm
            against the caller.

    Yields:
        str: stripped header lines and whitespace-free sequence fragments,
        in file order.
    """
    cmtreg = re.compile(r'>[^>]*?\n')
    # Holds a trailing '>...' that has not seen its newline yet, so that a
    # header split across two reads is still recognised as one header.
    carry = ''
    while True:
        chunk = fw.read(2**20)
        if not chunk:
            break
        data = carry + chunk
        # Offsets are relative to *data*, so they must restart for every read.
        headpos = 0
        for mat in cmtreg.finditer(data):
            if mat.start() != headpos:
                yield remove_whitespaces(data[headpos:mat.start()])
            yield mat.group().strip()
            headpos = mat.end()
        tail = data[headpos:]
        # Nothing after the last '>' in the tail contains a newline (it
        # would have matched otherwise), so it may be an unfinished header.
        cut = tail.rfind('>')
        if cut < 0:
            carry = ''
        else:
            tail, carry = tail[:cut], tail[cut:]
        if tail:
            yield remove_whitespaces(tail)
    if carry:
        # The input ended without a newline after the last '>': the header
        # pattern can never match, so the remainder is emitted as sequence.
        yield remove_whitespaces(carry)
Esempio n. 2
0
def read(infile):
    """Parse a substitution-matrix file into labels and an integer array.

    *infile* is opened with ``streaming.FileWrapper``.  If that raises
    ``IOError``, the path returned by ``locate_submat(infile.lower())`` is
    opened instead.

    The first non-blank line that does not start with ``#`` is the header;
    it is passed through ``remove_whitespaces`` to form the column labels.
    Every following non-blank line must split into ``len(labels) + 1``
    whitespace-separated fields: a row label followed by integer scores.

    Args:
        infile: value handed to ``streaming.FileWrapper``; it must support
            ``.lower()`` for the fallback lookup.

    Returns:
        tuple: ``(istring, jstring, submatr)`` -- the concatenated row
        labels, the column labels, and the scores as a NumPy integer array.

    Raises:
        ValueError: if no header line is found, if a row has the wrong
            number of fields, or if a score is not an integer literal.
    """
    try:
        wrapper = streaming.FileWrapper(infile, "r")
    except IOError:
        # Opening the given name failed; retry with the located resource.
        wrapper = streaming.FileWrapper(locate_submat(infile.lower()), "r")

    with wrapper:
        jstring = None
        for raw in wrapper.file:
            candidate = raw.strip()
            if not candidate or candidate.startswith("#"):
                continue
            jstring = remove_whitespaces(candidate)
            break

        if jstring is None:
            raise ValueError("this sub matrix file is broken")

        # One row label plus one score per column label.
        expected_fields = len(jstring) + 1
        row_labels = []
        score_rows = []

        # Continue over the same file object for the matrix body.
        for raw in wrapper.file:
            fields = raw.split()
            if not fields:
                continue
            if len(fields) != expected_fields:
                raise ValueError("this sub matrix file is broken")
            label, *values = fields
            row_labels.append(label)
            score_rows.append(list(map(int, values)))

    return "".join(row_labels), jstring, np.array(score_rows, dtype=int)