Ejemplo n.º 1
0
def TpedIterator(handle):
    """
    Iterate over a TPed (transposed PED) file handle, yielding Marker objects.

    Every line that is neither a comment (starts with '#') nor blank is split
    on whitespace. The first four fields are read as chromosome, marker id,
    a third column that is not used here (the genetic-distance column in the
    PLINK .tped layout) and position. The remaining fields are taken two by
    two: each pair holds the two alleles of one individual.

    For each such line a Marker is built from the marker id, its ``position``
    attribute is set to the string "<position> - chrom <chromosome>" and its
    ``genotypes`` attribute to a list of (allele1, allele2) tuples.

    Raises InvalidInputFile when a line has fewer than four fields, or when
    the allele columns cannot be grouped in pairs (odd number of alleles).

    >>> import tempfile
    >>> sample_Tped_file = tempfile.TemporaryFile()
    >>> sample_Tped_file.write('''
    ... # Chromosome Marker_Id ??? position locus1_allele1 locus1_allele2 locus2_allele1
    ... 4 rs10000543 0 30979886 C C T C 
    ... 4 rs10000929 0 131516474 A A A A
    ... 4 rs10002472 0 159087423 A G G G
    ... ''')
    >>> sample_Tped_file.seek(0)
    >>> ti = TpedIterator(sample_Tped_file)
    >>> for marker in ti:
    ...     print marker
    Marker rs10000543, 2 individuals
    Marker rs10000929, 2 individuals
    Marker rs10002472, 2 individuals
    """
    for line in handle:
        # comment lines and empty lines carry no marker: ignore them
        if line.startswith('#') or line.strip() == '':
            continue

        tped_fields = line.split()
        if len(tped_fields) < 4:
            raise InvalidInputFile('line too short - check input file\n("%s")' % line.strip())

        # everything after the four fixed columns is allele data, two per individual
        allele_fields = tped_fields[4:]
        if len(allele_fields) % 2 != 0:
            # an unpaired trailing allele would otherwise surface as a bare IndexError
            raise InvalidInputFile('odd number of alleles - check input file\n("%s")' % line.strip())
        # TODO: check that the current line has the same length as the previous one
        # TODO: check that the allele fields contain valid characters

        chromosome = tped_fields[0]
        marker_name = tped_fields[1]
        # tped_fields[2] is not used by this parser
        position = tped_fields[3]

        current_marker = Marker(marker_name)
        current_marker.position = "%s - chrom %s" % (position, chromosome)
        logging.debug(current_marker)

        # pair up consecutive alleles: (a0, a1), (a2, a3), ...
        genotypes = list(zip(allele_fields[0::2], allele_fields[1::2]))
        logging.debug(genotypes)
        current_marker.genotypes = genotypes

        yield current_marker
Ejemplo n.º 2
0
def hgdpgenotypesParser(handle, individuals_filter = None, markers_filter = None):
    """
    Parse a genotypes file handle and return a list of Marker objects.

    The first line read from ``handle`` is the header: the individual ids,
    separated by whitespace. Every following non-blank line holds a marker
    name followed by one genotype per individual, in header order. One Marker
    is created per such line and only the genotypes of the selected
    individuals are added to it. Blank lines are skipped.

    Parameters:
        handle: file-like object positioned on the header line.
        individuals_filter: optional list of individuals to keep. When None,
            all the individuals found in the header are kept. The list is
            also passed as ``individuals`` to every Marker created.
        markers_filter: accepted but currently not used.

    Returns:
        A list with one Marker per genotype line, in file order.

    Raises:
        ValueError: if the file is empty (no header line can be read), or if
            a line has more genotypes than the header has individuals.

    >>> from StringIO import StringIO
    >>> genotypes_file = StringIO(
    ... '''  HGDP00001    HGDP00002    HGDP00003    HGDP00004    HGDP00005    HGDP00006    HGDP00007    HGDP00008    HGDP00009    HGDP000010
    ... rs1112390    AA    GG    AG    AA    AA    AA    AA    AA    AA    AA   
    ... rs1112391    TT    TC    CC    CC    CC    CC    CC    CC    CC    CC
    ... MitoA11252G    AA    AA    AA    AA    AA    AA    AA    AA    AA    AA
    ... rs11124185    TC    TT    TT    TT    TT    TT    TT    TT    TT    TT
    ... MitoA13265G    AA    AA    AA    AA    AA    AA    AA    AA    AA    AA
    ... MitoA13264G    GG    AA    AA    AA    GG    AG    AA    AA    AA    AA
    ... MitoA13781G    AA    AA    AA    AA    AA    AA    --    AA    AA    AA
    ... MitoA14234G    AA    AA    AA    AA    AA    AA    AA    AA    AA    AA
    ... MitoA14583G    AA    AA    AA    AA    AA    AA    AA    AA    AA    AA
    ... MitoA14906G    GG    GG    GG    GG    GG    GG    GG    GG    GG    GG
    ... MitoA15219G    AA    AA    AA    GG    AA    AA    AA    AA    AA    AA''')
    
    >>> individuals_filter = ['HGDP00001', 'HGDP00004', ]  
    >>> markers = hgdpgenotypesParser(genotypes_file, individuals_filter)
    >>> print '\t' + '\t'.join(markers[0].individuals)
     HGDP00001   HGDP00004
    >>> for marker in markers:
    ...    print marker.to_geno_format()    #doctest: +NORMALIZE_WHITESPACE
    rs1112390    AA    AA    
    rs1112391    TT    CC    
    MitoA11252G    AA    AA    
    rs11124185    TC    TT    
    MitoA13265G    AA    AA    
    MitoA13264G    GG    AA    
    MitoA13781G    AA    AA    
    MitoA14234G    AA    AA    
    MitoA14583G    AA    AA    
    MitoA14906G    GG    GG    
    MitoA15219G    AA    GG    

    """
    markers = []

    # The header line contains the individuals' names.
    # readline() returns an empty string (never None) at end of file.
    header = handle.readline()
    if not header:
        raise ValueError('Empty file!!')
    individuals = [Individual(ind_id) for ind_id in header.split()]
    if individuals_filter is None:
        individuals_filter = [ind.individual_id for ind in individuals]

    # Decide once, per column, whether its genotypes are kept; the membership
    # test is the same one that used to run for every cell of every line.
    # NOTE(review): this relies on Individual objects comparing equal to the
    # entries of individuals_filter - confirm against Individual.__eq__.
    keep_column = [individual in individuals_filter for individual in individuals]

    # The remaining lines contain the genotypes, one marker per line.
    for line in handle:
        fields = line.split()   # TODO: add more rigorous conditions
        if not fields:
            # blank line: nothing to parse
            continue

        genotypes = fields[1:]
        if len(genotypes) > len(individuals):
            raise ValueError(
                'line has more genotypes than individuals in the header - check input file\n("%s")'
                % line.strip())

        marker = Marker(name = fields[0], individuals = individuals_filter)
        markers.append(marker)

        for current_genotype, keep in zip(genotypes, keep_column):
            if keep:
                marker.add_genotype(current_genotype)

    return markers