Ejemplo n.º 1
0
def tabfile_feeder(
    datafile,
    header=1,
    sep='\t',
    includefn=None,
    assert_column_no=None):
    '''A generator yielding each data row of a delimited text file.

    datafile is passed to open_anyfile() to obtain the file object.
    header is the number of leading rows to skip before yielding data.
    sep is the field delimiter handed to csv.reader.
    includefn, if given, is called with each row (a list of fields);
        only rows for which it returns a true value are yielded.
    assert_column_no, if given (and non-zero), is the number of fields
        every data row must have; a row with a different number of
        fields raises ValueError.

    When a ValueError propagates out of the row loop, the number of rows
    read so far (header rows included) is printed before re-raising.
    '''

    with open_anyfile(datafile) as in_f:
        reader = csv.reader(in_f, delimiter=sep)
        lineno = 0
        try:
            for _ in range(header):
                # Use the builtin next() (works on Python 2.6+ and 3)
                # instead of the Python-2-only reader.next(). A default is
                # supplied so a file shorter than the header simply ends
                # the generator rather than leaking StopIteration.
                if next(reader, None) is None:
                    return
                lineno += 1

            for ld in reader:
                # Count every row read, including ones later filtered out,
                # so the number reported on error points at the bad row.
                lineno += 1
                if assert_column_no:
                    if len(ld) != assert_column_no:
                        err = "Unexpected column number:" \
                              " got {}, should be {}".format(len(ld), assert_column_no)
                        raise ValueError(err)
                if not includefn or includefn(ld):
                    yield ld
        except ValueError:
            print("Error at line number:", lineno)
            raise
Ejemplo n.º 2
0
def get_genome_in_bit(chr_fa_folder):
    ''' Load every chromosome fasta file found in chr_fa_folder and return
        a dictionary mapping the chromosome name ('1'..'22', 'X', 'Y', 'MT')
        to a bitarray holding that chromosome's sequence as converted,
        line by line, by nuc_to_bit.

        chr_fa_folder is the folder holding the gzipped fasta files, which
        must be named chr<i>.fa.gz  (e.g. chr1.fa.gz). They can be
        downloaded from the NCBI FTP site:

        ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/

        Progress and elapsed time are printed for each file and for the
        whole run.
    '''
    chromosomes = list(map(str, range(1, 23)))
    chromosomes.extend(('X', 'Y', 'MT'))
    genome = {}
    started_all = time.time()
    for chrom in chromosomes:
        started_chrom = time.time()
        base_name = 'chr{}.fa.gz'.format(chrom)
        print("Loading {}...".format(base_name), end='')
        fasta_path = os.path.join(chr_fa_folder, base_name)
        with open_anyfile(fasta_path) as fasta:
            # the first line of the file is the fasta header, not sequence
            fasta.readline()
            encoded = bitarray()
            for raw_line in fasta:
                encoded += nuc_to_bit(raw_line.rstrip('\n'))
            genome[chrom] = encoded
        print("done.[{}]".format(timesofar(started_chrom)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(started_all)))

    return genome
Ejemplo n.º 3
0
def tabfile_feeder(datafile, header=1, sep='\t',
                   includefn=None,
                   assert_column_no=None):
    '''A generator yielding each data row of a delimited text file.

    datafile is passed to open_anyfile() to obtain the file object.
    header is the number of leading rows to skip before yielding data.
    sep is the field delimiter handed to csv.reader.
    includefn, if given, is called with each row (a list of fields);
        only rows for which it returns a true value are yielded.
    assert_column_no, if given (and non-zero), is the number of fields
        every data row must have; a row with a different number of
        fields raises ValueError.

    When a ValueError propagates out of the row loop, the number of rows
    read so far (header rows included) is printed before re-raising.
    '''

    with open_anyfile(datafile) as in_f:
        reader = csv.reader(in_f, delimiter=sep)
        lineno = 0
        try:
            for _ in range(header):
                # Use the builtin next() (works on Python 2.6+ and 3)
                # instead of the Python-2-only reader.next(). A default is
                # supplied so a file shorter than the header simply ends
                # the generator rather than leaking StopIteration.
                if next(reader, None) is None:
                    return
                lineno += 1

            for ld in reader:
                # Count every row read, including ones later filtered out,
                # so the number reported on error points at the bad row.
                lineno += 1
                if assert_column_no:
                    if len(ld) != assert_column_no:
                        err = "Unexpected column number:" \
                              " got {}, should be {}".format(len(ld), assert_column_no)
                        raise ValueError(err)
                if not includefn or includefn(ld):
                    yield ld
        except ValueError:
            print("Error at line number:", lineno)
            raise
Ejemplo n.º 4
0
def get_genome_in_bit(chr_fa_folder):
    ''' Load every chromosome fasta file found in chr_fa_folder and return
        a dictionary mapping the chromosome name ('1'..'22', 'X', 'Y', 'MT')
        to a bitarray holding that chromosome's sequence as converted,
        line by line, by nuc_to_bit.

        chr_fa_folder is the folder holding the gzipped fasta files, which
        must be named chr<i>.fa.gz  (e.g. chr1.fa.gz). They can be
        downloaded from the NCBI FTP site:

        ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/

        Progress and elapsed time are printed for each file and for the
        whole run.
    '''
    names = [str(number) for number in range(1, 23)]
    names += ['X', 'Y', 'MT']
    bits_by_chr = {}
    overall_start = time.time()
    for name in names:
        chr_start = time.time()
        gz_name = 'chr{}.fa.gz'.format(name)
        print("Loading {}...".format(gz_name), end='')
        gz_path = os.path.join(chr_fa_folder, gz_name)
        with open_anyfile(gz_path) as handle:
            # the first line of the file is the fasta header, not sequence
            handle.readline()
            sequence = bitarray()
            for text in handle:
                sequence += nuc_to_bit(text.rstrip('\n'))
            bits_by_chr[name] = sequence
        print("done.[{}]".format(timesofar(chr_start)))
    print('='*20)
    print("Finished. [{}]".format(timesofar(overall_start)))

    return bits_by_chr
Ejemplo n.º 5
0
def rec_handler(infile, block_end='\n', skip=0, include_block_end=False, as_list=False):
    '''A generator to return a record (block of text)
       at once from the infile. Records are runs of consecutive lines
       separated by one or more lines equal to block_end (an empty
       line by default).
       infile is passed to open_anyfile() to obtain the file object.
       skip can be used to skip the top n lines.
       if include_block_end is True, one block_end line is appended to
       each returned record.
       if as_list is True, return a list of lines in one record,
       otherwise the lines joined into a single string.
    '''
    rec_separator = lambda line: line == block_end
    with open_anyfile(infile) as in_f:
        if skip:
            for _ in range(skip):
                in_f.readline()
        for is_separator, group in itertools.groupby(in_f, rec_separator):
            if is_separator:
                # a run of separator lines carries no record content
                continue
            # Bind the record lines on every path: previously the name was
            # only assigned when include_block_end was True, so the default
            # call failed with UnboundLocalError on the first record.
            _g = itertools.chain(group, (block_end,)) if include_block_end else group
            yield (list(_g) if as_list else ''.join(_g))
Ejemplo n.º 6
0
def rec_handler(infile,
                block_end='\n',
                skip=0,
                include_block_end=False,
                as_list=False):
    '''A generator to return a record (block of text)
       at once from the infile. Records are runs of consecutive lines
       separated by one or more lines equal to block_end (an empty
       line by default).
       infile is passed to open_anyfile() to obtain the file object.
       skip can be used to skip the top n lines.
       if include_block_end is True, one block_end line is appended to
       each returned record.
       if as_list is True, return a list of lines in one record,
       otherwise the lines joined into a single string.
    '''
    rec_separator = lambda line: line == block_end
    with open_anyfile(infile) as in_f:
        if skip:
            for _ in range(skip):
                in_f.readline()
        for is_separator, group in itertools.groupby(in_f, rec_separator):
            if is_separator:
                # a run of separator lines carries no record content
                continue
            # Bind the record lines on every path: previously the name was
            # only assigned when include_block_end was True, so the default
            # call failed with UnboundLocalError on the first record.
            _g = itertools.chain(group, (block_end, )) if include_block_end else group
            yield (list(_g) if as_list else ''.join(_g))