Ejemplo n.º 1
0
def _vcf_parseheader(fileobj):
    '''
    Reads lines from a VCF file object up to and including the column
    header line and builds the individuals named on it.

    Lines starting with '##' are skipped. The first line starting with a
    single '#' is treated as the column header: every whitespace-separated
    field after the ninth is used as an individual ID.

    Returns a (pop, inds) tuple: a new Population and the list of
    Individual objects registered with it, in header order.

    Raises FileFormatError if a line that does not start with '#' is
    reached before the header, or if the input ends without a header.
    '''
    pop = Population()
    for line in fileobj:
        if line.startswith('##'):
            continue

        if not line.startswith('#'):
            raise FileFormatError("No header line in VCF")

        # Fields 0-8 of the header line are not sample names; everything
        # from field 9 onwards is taken as an individual ID.
        ind_ids = line.strip().split()[9:]
        inds = [Individual(pop, ind_id) for ind_id in ind_ids]
        for ind in inds:
            pop.register_individual(ind)

        return pop, inds

    # Input was empty or held only '##' lines. Without this the function
    # would fall off the end and return None, which callers unpacking the
    # (pop, inds) tuple would hit as an unrelated TypeError.
    raise FileFormatError("No header line in VCF")
Ejemplo n.º 2
0
def read_vcf(filename, require_pass=False, freq_info=None, info_filters=None):
    '''
    Reads a VCF file and returns a Population object with the
    individuals represented in the file

    Arguments:
    filename: path of the VCF file to open
    require_pass: if true, records whose `filter_passed` attribute is
        falsy are skipped
    freq_info: optional key into each record's `info` mapping. When the
        key is present its value (the first entry if it is a
        comma-separated list) is converted to float and used as the
        marker frequency; otherwise the frequency is 0
    info_filters: optional iterable of callables. Each is called with
        the VCFRecord, and the record is kept only if every call returns
        a true value

    Lines before the column header that do not start with '#' are skipped.
    A new chromosome is started every time the chromosome label of a kept
    record differs from that of the previous kept record.

    Returns: the Population built from the file

    Raises ValueError if an entry of info_filters is not callable, or if
    the file has no column header line (a line starting with a single '#')
    '''
    if not info_filters:
        info_filters = []

    for info_filter in info_filters:
        if not callable(info_filter):
            raise ValueError('Filter not callable')

    with open(filename) as f:
        pop = Population()
        inds = None

        # Header section: skip '##' lines, stop at the column header line
        for line in f:
            if line.startswith('##'):
                continue

            if line.startswith('#'):
                ind_ids = line.strip().split()[9:]
                inds = [Individual(pop, ind_id) for ind_id in ind_ids]
                for ind in inds:
                    pop.register_individual(ind)

                break

        if inds is None:
            # Previously this surfaced later as an unbound-variable error
            raise ValueError('No header line in VCF')

        last_chrom = None
        chromobj = None
        genotypes = []

        # Record section: the same file iterator continues after the header
        for line in f:
            record = VCFRecord(line)

            if not all(info_filter(record) for info_filter in info_filters):
                continue

            if require_pass and not record.filter_passed:
                continue

            if chromobj is None or record.chrom != last_chrom:
                # Chromosome changed: close out the previous one (if any)
                if chromobj is not None:
                    chromobj.finalize()
                    pop.add_chromosome(chromobj)
                chromobj = ChromosomeTemplate(label=record.chrom)

            if freq_info is not None and freq_info in record.info:
                freq = record.info[freq_info]
                if ',' in freq:
                    freq = freq.split(',')[0]
                freq = float(freq)
            else:
                freq = 0

            genotypes.append(record.genotypes())

            chromobj.add_genotype(bp=record.pos,
                                  label=record.label,
                                  frequency=freq)

            last_chrom = record.chrom

        # A file with no surviving records never creates a chromosome
        if chromobj is not None:
            chromobj.finalize()
            pop.add_chromosome(chromobj)

    for ind in inds:
        # Initialize new genotypes
        ind._init_genotypes(sparse=True)

    # Now actually sift through markers and assign them to individuals.
    # final_indices lists (chromosome index, marker index) pairs in the
    # same order the genotype rows were collected.
    final_indices = []
    for chromidx, chrom in enumerate(pop.chromosomes):
        nmark = chrom.nmark()
        final_indices.extend(zip([chromidx] * nmark, range(nmark)))

    for raw, (chromidx, markidx) in zip(range(len(genotypes)), final_indices):
        assign_genorow(genotypes[raw], inds, chromidx, markidx)

        # Kill the row so we don't end up with the whole dataset in memory twice
        genotypes[raw] = None

    return pop