Example #1
0
File: vcf.py Project: pomo-dev/PoMo
def open_seq(VCFFileName, maxskip=100, name=None):
    """Open a VCF4.1 file and read all of its bases.

    Open the given VCF file, look for the ``#CHROM`` field header
    within the first *maxskip* lines and read every following line as
    a base.  Meta lines (starting with ``##``) that precede the field
    header are collected in the ``header`` attribute of the returned
    :class:`VCFSeq` object.

    The file is closed before returning, also if an error is raised.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the file ends before
        the field header is found or if the field header is not found
        within *maxskip* lines.

    :rtype: VCFSeq

    """
    def test_sequence(seq):
        """Test a given VCF sequence.

        TODO: implement this.

        :param seq: The sequence to be tested.

        """
        pass

    seq = VCFSeq()
    seq.header = ""

    VCFFile = sb.gz_open(VCFFileName)
    # From here on the file object is open; make sure it gets closed
    # even if the file turns out to be malformed.
    try:
        # Set the vcf sequence name.
        if name is not None:
            seq.name = name
        else:
            seq.name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                # readline() returns '' only at the end of the file.
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line.startswith('##'):
                seq.header += line
            elif line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                seq.speciesL = get_indiv_from_field_header(line)
                seq.nSpecies = len(seq.speciesL)
                break
        else:
            # The loop ran out without hitting the field header.
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # All remaining lines are data lines.
        for line in VCFFile:
            seq.append_nuc_base(get_nuc_base_from_line(line))
    finally:
        VCFFile.close()

    test_sequence(seq)
    return seq
Example #2
0
    def __init__(self, faFileName, maxskip=50, name=None):
        """Open a fasta file and initialize :class:`MFaStream`.

        Look for the first header line (starting with ``>``) within
        the first *maxskip* lines and read the first alignment with
        :func:`read_align_from_fo`.  The file object stays open and is
        stored in ``self.fo``; it is closed here only if initialization
        fails.

        :param str faFileName: File name of the fasta file.
        :param int maxskip: Only look *maxskip* lines for the start of
                            a sequence (defaults to 50).
        :param str name: Set the name of the stream to *name*,
                         otherwise set it to the stripped filename.

        :raises NotAFastaFileError: If the file ends before a header
            line is found or if no header line is found within
            *maxskip* lines.

        """
        faFile = sb.gz_open(faFileName)
        try:
            if name is None:
                name = sb.stripFName(faFileName)
            # Find the start of the first sequence.
            for _ in range(maxskip):
                line = faFile.readline()
                if line == '':
                    # readline() returns '' only at the end of the file.
                    raise NotAFastaFileError("File contains no data.")
                if line[0] == '>':
                    # Species name found in line.
                    break
            else:
                # str() is needed; concatenating the integer directly
                # raises a TypeError instead of the intended error.
                raise NotAFastaFileError(
                    "Didn't find a species header within " +
                    str(maxskip) + " lines.")
            (nextHL, seqL) = read_align_from_fo(line, faFile)
        except Exception:
            # Do not leak the file object if the file is unusable.
            faFile.close()
            raise

        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # NOTE(review): presumably `nextHL` is None when there is
            # no further header line -- confirm against
            # read_align_from_fo.
            pass

        # Add state objects.
        self.name = name
        self.seqL = seqL
        self.nSpecies = len(self.seqL)
        self.nextHeaderLine = nextHL
        self.fo = faFile
Example #3
0
def open_seq(VCFFileName, maxskip=100, name=None):
    """Open a VCF4.1 file and read all of its bases.

    Open the given VCF file, look for the ``#CHROM`` field header
    within the first *maxskip* lines and read every following line as
    a base.  Meta lines (starting with ``##``) that precede the field
    header are collected in the ``header`` attribute of the returned
    :class:`VCFSeq` object.

    The file is closed before returning, also if an error is raised.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the file ends before
        the field header is found or if the field header is not found
        within *maxskip* lines.

    :rtype: VCFSeq

    """
    def test_sequence(seq):
        """Test a given VCF sequence.

        TODO: implement this.

        :param seq: The sequence to be tested.

        """
        pass

    seq = VCFSeq()
    seq.header = ""

    VCFFile = sb.gz_open(VCFFileName)
    # The file object is open from here on; the ``finally`` clause
    # guarantees that it is closed on every exit path.
    try:
        # Set the vcf sequence name.
        if name is not None:
            seq.name = name
        else:
            seq.name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        headerFound = False
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                # readline() returns '' only at the end of the file.
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line.startswith('##'):
                seq.header += line
            elif line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                seq.speciesL = get_indiv_from_field_header(line)
                seq.nSpecies = len(seq.speciesL)
                headerFound = True
                break
        if not headerFound:
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # All remaining lines are data lines.
        for line in VCFFile:
            seq.append_nuc_base(get_nuc_base_from_line(line))
    finally:
        VCFFile.close()

    test_sequence(seq)
    return seq
Example #4
0
def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    Open the given fasta file, look for the first header line
    (starting with ``>``) within the first *maxskip* lines and read
    all sequences with :func:`read_seq_from_fo`.  The sequences are
    appended to the ``seqL`` attribute of the returned :class:`FaSeq`
    object and ``nSpecies`` is incremented for each of them.

    The file is closed before returning, also if an error is raised.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the file ends before a header line
        is found or if no header line is found within *maxskip* lines.
    :raises sb.SequenceDataError: If a sequence has an empty name or
        empty data, or if the sequence names are not unique.

    :rtype: FaSeq

    """
    def test_sequence(faSequence):
        """Test if all sequences have a unique name and contain data."""
        nSeqs = faSequence.nSpecies
        names = []
        for i in range(nSeqs):
            entry = faSequence.seqL[i]
            names.append(entry.name)
            if entry.name == '' or entry.data == '':
                raise sb.SequenceDataError("Sequence name or data is missing.")
        if nSeqs > len(set(names)):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()

    faFile = sb.gz_open(faFileName)
    # The file object is open from here on; the ``finally`` clause
    # guarantees that it is closed on every exit path.
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                # readline() returns '' only at the end of the file.
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # Species name found in line.
                break
        else:
            # str() is needed; concatenating the integer directly
            # raises a TypeError instead of the intended error.
            raise NotAFastaFileError(
                "Didn't find a species header within " +
                str(maxskip) + " lines.")
        # read_seq_from_fo hands back the next header line together
        # with the sequence; the loop stops once that line is None.
        while line is not None:
            (line, seq) = read_seq_from_fo(line, faFile)
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        faFile.close()

    test_sequence(fastaSeq)
    return fastaSeq
Example #5
0
    def __init__(self, CFFileName, name=None):
        """Open a counts file and read its first data line.

        Skip comment lines (starting with ``#``), check the
        ``COUNTSFILE`` line and the ``CHROM``/``POS`` header line and
        interpret the first data line with :func:`interpret_cf_line`.
        The file object stays open and is stored in ``self.fo``; it is
        closed here only if initialization fails.

        :param str CFFileName: Name of the counts file.
        :param str name: Set the name of the sequence to *name*,
                         otherwise set it to the stripped filename.

        :raises NotACountsFormatFileError: If the file contains no
            data, if the first line or the header line is corrupt, or
            if the number of counts in the first data line differs
            from the number of names in the header line.

        """
        def next_non_comment(fo):
            """Return the next line of *fo* that is not a comment.

            An empty string is returned at the end of the file.

            """
            ln = fo.readline()
            # ''.startswith('#') is False, so the loop also terminates
            # at the end of the file (indexing ln[0] would raise an
            # IndexError there).
            while ln.startswith('#'):
                ln = fo.readline()
            return ln

        CFFile = sb.gz_open(CFFileName)
        try:
            # Set the cf sequence name.
            if name is None:
                name = sb.stripFName(CFFileName)

            # Read in first line.
            ln = next_non_comment(CFFile)
            if ln == '':
                raise NotACountsFormatFileError("File contains no data.")
            lnL = ln.split()
            # Check the length first so that a blank line is reported
            # as corrupt instead of raising an IndexError.
            if (len(lnL) != 5) or (lnL[0] != "COUNTSFILE"):
                raise NotACountsFormatFileError("First line is corrupt.")
            # TODO: The first line is needed by IQ-Tree, but not by
            # libPoMo.  Maybe I should use this information here!

            # Read in headerline.
            ln = next_non_comment(CFFile)
            lnL = ln.split('\t')
            if (len(lnL) < 2
                    or lnL[0] not in ["CHROM", "Chrom"]
                    or lnL[1] not in ["POS", "Pos"]):
                raise NotACountsFormatFileError("Header line is corrupt.")
            # Everything after the first two columns is a name.
            indivL = [field.strip() for field in lnL[2:]]

            # Read in the first data line.
            ln = CFFile.readline()
            (chrom, pos, countsL) = interpret_cf_line(ln)
            if len(countsL) != len(indivL):
                raise NotACountsFormatFileError(
                    "Line doesn't fit nr. of species.")
        except Exception:
            # Do not leak the file object if the file is unusable.
            CFFile.close()
            raise

        self.name = name
        self.chrom = chrom
        self.pos = pos
        self.fo = CFFile
        self.indivL = indivL
        self.countsL = countsL
        self.nIndiv = len(countsL)
Example #6
0
    def __init__(self, CFFileName, name=None):
        """Open a counts file and read its first data line.

        Skip comment lines (starting with ``#``), check the
        ``COUNTSFILE`` line and the ``CHROM``/``POS`` header line and
        interpret the first data line with :func:`interpret_cf_line`.
        The file object stays open and is stored in ``self.fo``; it is
        closed here only if initialization fails.

        :param str CFFileName: Name of the counts file.
        :param str name: Set the name of the sequence to *name*,
                         otherwise set it to the stripped filename.

        :raises NotACountsFormatFileError: If the file contains no
            data, if the first line or the header line is corrupt, or
            if the number of counts in the first data line differs
            from the number of names in the header line.

        """
        def read_past_comments(fo):
            """Read lines from *fo* until one is not a comment.

            An empty string is returned at the end of the file.

            """
            ln = fo.readline()
            # Use startswith() instead of ln[0]: the latter raises an
            # IndexError once readline() returns '' at the end of the
            # file.
            while ln.startswith('#'):
                ln = fo.readline()
            return ln

        CFFile = sb.gz_open(CFFileName)
        try:
            # Set the cf sequence name.
            if name is None:
                name = sb.stripFName(CFFileName)

            # Read in first line.
            ln = read_past_comments(CFFile)
            if ln == '':
                raise NotACountsFormatFileError("File contains no data.")
            firstL = ln.split()
            # A blank line splits into an empty list; test the length
            # before looking at the first field.
            if (len(firstL) != 5) or (firstL[0] != "COUNTSFILE"):
                raise NotACountsFormatFileError("First line is corrupt.")
            # TODO: The first line is needed by IQ-Tree, but not by
            # libPoMo.  Maybe I should use this information here!

            # Read in headerline.
            ln = read_past_comments(CFFile)
            headerL = ln.split('\t')
            if (len(headerL) < 2
                    or headerL[0] not in ["CHROM", "Chrom"]
                    or headerL[1] not in ["POS", "Pos"]):
                raise NotACountsFormatFileError("Header line is corrupt.")
            # Everything after the first two columns is a name.
            indivL = [field.strip() for field in headerL[2:]]

            # Read in the first data line.
            ln = CFFile.readline()
            (chrom, pos, countsL) = interpret_cf_line(ln)
            if len(countsL) != len(indivL):
                raise NotACountsFormatFileError(
                    "Line doesn't fit nr. of species.")
        except Exception:
            # Do not leak the file object if the file is unusable.
            CFFile.close()
            raise

        self.name = name
        self.chrom = chrom
        self.pos = pos
        self.fo = CFFile
        self.indivL = indivL
        self.countsL = countsL
        self.nIndiv = len(countsL)
Example #7
0
def init_seq(faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize an :class:`FaStream`.

    Open the given fasta file, look for the first header line
    (starting with ``>``) within the first *maxskip* lines and read
    the first sequence with :func:`read_seq_from_fo`.  It returns an
    :class:`FaStream` object.  This object can later be used to parse
    the whole fasta file.

    Please close the associated file object with
    :func:`FaStream.close` when you don't need it anymore.  If
    initialization fails, the file is closed here.

    :param str faFileName: File name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the file ends before a header line
        is found or if no header line is found within *maxskip* lines.

    :rtype: FaStream

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                # readline() returns '' only at the end of the file.
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # Species name found in line.
                break
        else:
            # str() is needed; concatenating the integer directly
            # raises a TypeError instead of the intended error.
            raise NotAFastaFileError(
                "Didn't find a species header within " +
                str(maxskip) + " lines.")
        (nextHL, seq) = read_seq_from_fo(line, faFile)
        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # NOTE(review): presumably `nextHL` is None when the file
            # contains no further sequence -- confirm against
            # read_seq_from_fo.
            pass
        return FaStream(name, seq, nextHL, faFile)
    except Exception:
        # Do not leak the file object if no stream could be created.
        faFile.close()
        raise
Example #8
0
File: vcf.py Project: pomo-dev/PoMo
def init_seq(VCFFileName, maxskip=100, name=None):
    """Open a (gzipped) VCF4.1 file.

    Open the given VCF file and look for the ``#CHROM`` field header
    within the first *maxskip* lines.  Initialize a
    :class:`VCFStream` object that contains the first base.

    Please close the associated file object with
    :func:`VCFStream.close` when you don't need it anymore.  If
    initialization fails, the file is closed here.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the file ends before
        the field header is found or if the field header is not found
        within *maxskip* lines.

    :rtype: VCFStream

    """
    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is None:
            name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                # readline() returns '' only at the end of the file.
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                speciesL = get_indiv_from_field_header(line)
                break
        else:
            # The loop ran out without hitting the field header.
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # The line after the field header is the first data line.
        line = VCFFile.readline()
        base = get_nuc_base_from_line(line, info=False)
        base.set_ploidy()
        return VCFStream(name, VCFFile, speciesL, base)
    except Exception:
        # Do not leak the file object if no stream could be created.
        VCFFile.close()
        raise
Example #9
0
def init_seq(VCFFileName, maxskip=100, name=None):
    """Open a (gzipped) VCF4.1 file.

    Open the given VCF file and look for the ``#CHROM`` field header
    within the first *maxskip* lines.  Initialize a
    :class:`VCFStream` object that contains the first base.

    Please close the associated file object with
    :func:`VCFStream.close` when you don't need it anymore.  If
    initialization fails, the file is closed here.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the file ends before
        the field header is found or if the field header is not found
        within *maxskip* lines.

    :rtype: VCFStream

    """
    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is None:
            name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        speciesL = None
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                # readline() returns '' only at the end of the file.
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                speciesL = get_indiv_from_field_header(line)
                headerFound = True
                break
        else:
            headerFound = False
        if not headerFound:
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # The line after the field header is the first data line.
        line = VCFFile.readline()
        base = get_nuc_base_from_line(line, info=False)
        base.set_ploidy()
        return VCFStream(name, VCFFile, speciesL, base)
    except Exception:
        # Do not leak the file object if no stream could be created.
        VCFFile.close()
        raise