Python gz_open Beispiele, cflib.seqbase.gz_open Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: cf.py Projekt: morganmuell/cflib

    def close(self):
        """Close all file objects and prepend the counts file header line.

        The header line ``COUNTSFILE NPOP <nPop> NSITES <baseCounter>``
        is written to a temporary file located next to *self.outFN*,
        the content of the already written output file is copied
        behind it, and the temporary file then replaces *self.outFN*.

        """
        for tf in self.vcfTfL:
            tf.close()
        self.outFO.close()

        # Insert the first line.  TODO: The whole file needs to be
        # copied, maybe there is a better method?
        temp_path = os.path.join(os.path.dirname(self.outFN),
                                 "temp_" + os.path.basename(self.outFN))
        # The context managers make sure that both files are closed even
        # if copying fails half way through.
        with sb.gz_open(temp_path, mode='w') as fo:
            print("COUNTSFILE NPOP",
                  self.nPop,
                  "NSITES",
                  self.baseCounter,
                  file=fo)
            with sb.gz_open(self.outFN, mode='r') as f:
                for ln in f:
                    print(ln, file=fo, end='')
        # os.replace() overwrites an existing destination on every
        # platform, whereas os.rename() fails on Windows if the target
        # already exists (which is always the case here).
        os.replace(temp_path, self.outFN)

Beispiel #2

0

Datei anzeigen

Datei: fasta.py Projekt: morganmuell/cflib

    def __init__(self, faFileName, maxskip=50, name=None):
        """Open a fasta file and initialize :class:`MFaStream`.

        :param str faFileName: Name of the fasta file; it is opened with
                               ``sb.gz_open``.
        :param int maxskip: Only look *maxskip* lines for the first
                            header line (a line starting with ``>``).
        :param str name: Name of the stream; if it is None, the result
                         of ``sb.stripFName(faFileName)`` is used.

        :raises NotAFastaFileError: If the file ends before a header
                                    line was found or if no header line
                                    is found within *maxskip* lines.

        """
        faFile = sb.gz_open(faFileName)
        try:
            if name is None:
                name = sb.stripFName(faFileName)
            # Find the start of the first sequence.
            for _ in range(maxskip):
                line = faFile.readline()
                if line == '':
                    raise NotAFastaFileError("File contains no data.")
                if line[0] == '>':
                    # species name found in line
                    break
            else:
                # maxskip is an integer and has to be converted before
                # it can be concatenated to the message.
                raise NotAFastaFileError(
                    "Didn't find a species header within " +
                    str(maxskip) + " lines.")
            (nextHL, seqL) = read_align_from_fo(line, faFile)
        except Exception:
            # Do not leak the file object if the initialization fails.
            faFile.close()
            raise
        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # NOTE(review): presumably nextHL is None when the end of
            # the file has been reached -- confirm in read_align_from_fo.
            pass

        # Add state objects.
        self.name = name
        self.seqL = seqL
        self.nSpecies = len(self.seqL)
        self.nextHeaderLine = nextHL
        self.fo = faFile

Beispiel #3

0

Datei anzeigen

Datei: fasta.py Projekt: pomo-dev/cflib

    def __init__(self, faFileName, maxskip=50, name=None):
        """Open a fasta file and initialize :class:`MFaStream`.

        :param str faFileName: Name of the fasta file; it is opened with
                               ``sb.gz_open``.
        :param int maxskip: Only look *maxskip* lines for the first
                            header line (a line starting with ``>``).
        :param str name: Name of the stream; if it is None, the result
                         of ``sb.stripFName(faFileName)`` is used.

        :raises NotAFastaFileError: If the file ends before a header
                                    line was found or if no header line
                                    is found within *maxskip* lines.

        """
        faFile = sb.gz_open(faFileName)
        try:
            if name is None:
                name = sb.stripFName(faFileName)
            # Find the start of the first sequence.
            for _ in range(maxskip):
                line = faFile.readline()
                if line == '':
                    raise NotAFastaFileError("File contains no data.")
                if line[0] == '>':
                    # species name found in line
                    break
            else:
                # maxskip is an integer and has to be converted before
                # it can be concatenated to the message.
                raise NotAFastaFileError(
                    "Didn't find a species header within " +
                    str(maxskip) + " lines.")
            (nextHL, seqL) = read_align_from_fo(line, faFile)
        except Exception:
            # Do not leak the file object if the initialization fails.
            faFile.close()
            raise
        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # NOTE(review): presumably nextHL is None when the end of
            # the file has been reached -- confirm in read_align_from_fo.
            pass

        # Add state objects.
        self.name = name
        self.seqL = seqL
        self.nSpecies = len(self.seqL)
        self.nextHeaderLine = nextHL
        self.fo = faFile

Beispiel #4

0

Datei anzeigen

Datei: vcf.py Projekt: morganmuell/cflib

def open_seq(VCFFileName, maxskip=100, name=None):
    """Open a VCF4.1 file.

    Try to open the given VCF file, checks if it is in VCF format and
    reads the bases(s).  It returns an :class:`VCFSeq` object that
    contains all the information.

    The file is always closed again, also if an error is raised while
    it is being read.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the file ends before the
        ``#CHROM`` line was found or if this line is not found within
        *maxskip* lines.

    """
    def test_sequence(seq):
        """ Test a given VCF sequence.

        TODO: implement this; at the moment nothing is checked.

        :param seq:
        :returns:
        :rtype:

        """
        pass

    seq = VCFSeq()
    seq.header = ""

    VCFFile = sb.gz_open(VCFFileName)
    try:
        # set the vcf sequence name
        if name is not None:
            seq.name = name
        else:
            seq.name = sb.stripFName(VCFFileName)
        # Find the start of the first base
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line.startswith('##'):
                # Meta information lines are collected in the header.
                seq.header += line
            if line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                seq.speciesL = get_indiv_from_field_header(line)
                seq.nSpecies = len(seq.speciesL)
                break
        else:
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # All remaining lines are data lines.
        for line in VCFFile:
            base = get_nuc_base_from_line(line)
            seq.append_nuc_base(base)
    finally:
        VCFFile.close()

    test_sequence(seq)
    return seq

Beispiel #5

0

Datei anzeigen

Datei: cf.py Projekt: morganmuell/cflib

    def __init_outFO(self):
        """Open the output file and store the file object.

        *self.outFN* is opened for writing with ``sb.gz_open`` and the
        returned file object is saved in *self.outFO*.  According to
        the original note, an output file whose name ends with "gz" is
        compressed and opened with gzip.open().

        """
        out_file_object = sb.gz_open(self.outFN, mode='w')
        self.outFO = out_file_object

Beispiel #6

0

Datei anzeigen

Datei: cf.py Projekt: pomo-dev/cflib

    def __init_outFO(self):
        """Open the output file and store the file object.

        *self.outFN* is opened for writing with ``sb.gz_open`` and the
        returned file object is saved in *self.outFO*.  According to
        the original note, an output file whose name ends with "gz" is
        compressed and opened with gzip.open().

        """
        out_file_object = sb.gz_open(self.outFN, mode='w')
        self.outFO = out_file_object

Beispiel #7

0

Datei anzeigen

Datei: fasta.py Projekt: pomo-dev/cflib

def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the sequence(s).  It returns an
    :class:`FaSeq` object that contains a list of species names, a
    list of the respective descriptions and a list with the sequences.

    The file is always closed again, also if an error is raised while
    it is being read.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a sequence
                        (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the file ends before a header line
        was found or if no header line (starting with ``>``) is found
        within *maxskip* lines.
    :raises sb.SequenceDataError: If a sequence has an empty name or
        empty data, or if the sequence names are not unique.

    """
    def test_sequence(faSequence):
        """Tests if sequences contain data and have unique names."""
        names = set()
        for i in range(faSequence.nSpecies):
            seq = faSequence.seqL[i]
            if seq.name == '' or seq.data == '':
                raise sb.SequenceDataError("Sequence name or data is missing.")
            names.add(seq.name)
        if faSequence.nSpecies > len(names):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()

    faFile = sb.gz_open(faFileName)
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # maxskip is an integer and has to be converted before it
            # can be concatenated to the message.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        # read_seq_from_fo returns the next header line together with
        # the sequence; the loop stops when this line is None.
        while line is not None:
            (line, seq) = read_seq_from_fo(line, faFile)
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        faFile.close()
    test_sequence(fastaSeq)

    # Make the sequences accessible by name.
    for s in fastaSeq.seqL:
        fastaSeq.seqD[s.name] = s
    return fastaSeq

Beispiel #8

0

Datei anzeigen

Datei: fasta.py Projekt: morganmuell/cflib

def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the sequence(s).  It returns an
    :class:`FaSeq` object that contains a list of species names, a
    list of the respective descriptions and a list with the sequences.

    The file is always closed again, also if an error is raised while
    it is being read.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a sequence
                        (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the file ends before a header line
        was found or if no header line (starting with ``>``) is found
        within *maxskip* lines.
    :raises sb.SequenceDataError: If a sequence has an empty name or
        empty data, or if the sequence names are not unique.

    """
    def test_sequence(faSequence):
        """Tests if sequences contain data and have unique names."""
        names = set()
        for i in range(faSequence.nSpecies):
            seq = faSequence.seqL[i]
            if seq.name == '' or seq.data == '':
                raise sb.SequenceDataError("Sequence name or data is missing.")
            names.add(seq.name)
        if faSequence.nSpecies > len(names):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()

    faFile = sb.gz_open(faFileName)
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # maxskip is an integer and has to be converted before it
            # can be concatenated to the message.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        # read_seq_from_fo returns the next header line together with
        # the sequence; the loop stops when this line is None.
        while line is not None:
            (line, seq) = read_seq_from_fo(line, faFile)
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        faFile.close()
    test_sequence(fastaSeq)

    # Make the sequences accessible by name.
    for s in fastaSeq.seqL:
        fastaSeq.seqD[s.name] = s
    return fastaSeq

Beispiel #9

0

Datei anzeigen

Datei: cf.py Projekt: pomo-dev/cflib

    def close(self):
        """Close all file objects and prepend the counts file header line.

        The header line ``COUNTSFILE NPOP <nPop> NSITES <baseCounter>``
        is written to a temporary file located next to *self.outFN*,
        the content of the already written output file is copied
        behind it, and the temporary file then replaces *self.outFN*.

        """
        for tf in self.vcfTfL:
            tf.close()
        self.outFO.close()

        # Insert the first line.  TODO: The whole file needs to be
        # copied, maybe there is a better method?
        temp_path = os.path.join(os.path.dirname(self.outFN),
                                 "temp_" + os.path.basename(self.outFN))
        # The context managers make sure that both files are closed even
        # if copying fails half way through.
        with sb.gz_open(temp_path, mode='w') as fo:
            print("COUNTSFILE NPOP", self.nPop, "NSITES",
                  self.baseCounter, file=fo)
            with sb.gz_open(self.outFN, mode='r') as f:
                for ln in f:
                    print(ln, file=fo, end='')
        # os.replace() overwrites an existing destination on every
        # platform, whereas os.rename() fails on Windows if the target
        # already exists (which is always the case here).
        os.replace(temp_path, self.outFN)

Beispiel #10

0

Datei anzeigen

Datei: cf.py Projekt: morganmuell/cflib

    def __init__(self, CFFileName, name=None):
        """Open a counts file and read everything up to the first data line.

        The file is opened with ``sb.gz_open``.  Lines starting with
        ``#`` are skipped in front of the ``COUNTSFILE`` line and in
        front of the header line.  The file object is kept open in
        *self.fo*; it is only closed here if the initialization fails.

        :param str CFFileName: Name of the counts file.
        :param str name: Name of the stream; if it is None, the result
                         of ``sb.stripFName(CFFileName)`` is used.

        :raises NotACountsFormatFileError: If the file contains no
            data, if the first line or the header line is corrupt, or
            if the number of counts in the first data line differs from
            the number of names in the header line.

        """
        CFFile = sb.gz_open(CFFileName)
        try:
            # Set the cf sequence name.
            if name is None:
                name = sb.stripFName(CFFileName)

            # Skip comments.  str.startswith() is used because it is
            # also safe for the empty string returned at the end of the
            # file; indexing with ln[0] raises an IndexError there.
            ln = CFFile.readline()
            while ln.startswith('#'):
                ln = CFFile.readline()
            if ln == '':
                raise NotACountsFormatFileError("File contains no data.")

            # Read in first line.  The length is checked first so that
            # a blank line cannot lead to an IndexError.
            lnL = ln.split()
            if (len(lnL) != 5) or (lnL[0] != "COUNTSFILE"):
                raise NotACountsFormatFileError("First line is corrupt.")
            # TODO: The first line is needed by IQ-Tree, but not by
            # cflib.  Maybe I should use this information here!

            # Skip comments.
            ln = CFFile.readline()
            while ln.startswith('#'):
                ln = CFFile.readline()

            # Read in headerline.
            lnL = ln.split()
            if (len(lnL) < 2 or
                    lnL[0] not in ["CHROM", "Chrom"] or
                    lnL[1] not in ["POS", "Pos"]):
                raise NotACountsFormatFileError("Header line is corrupt.")
            # Everything after the first two columns is a name.
            indivL = [field.strip() for field in lnL[2:]]

            ln = CFFile.readline()
            (chrom, pos, countsL) = interpret_cf_line(ln)
            if len(countsL) != len(indivL):
                raise NotACountsFormatFileError(
                    "Line doesn't fit nr. of species.")
        except Exception:
            # Do not leak the file object if the initialization fails.
            CFFile.close()
            raise

        self.name = name
        self.chrom = chrom
        self.pos = pos
        self.fo = CFFile
        self.indivL = indivL
        self.countsL = countsL
        self.nIndiv = len(countsL)

Beispiel #11

0

Datei anzeigen

Datei: cf.py Projekt: pomo-dev/cflib

    def __init__(self, CFFileName, name=None):
        """Open a counts file and read everything up to the first data line.

        The file is opened with ``sb.gz_open``.  Lines starting with
        ``#`` are skipped in front of the ``COUNTSFILE`` line and in
        front of the header line.  The file object is kept open in
        *self.fo*; it is only closed here if the initialization fails.

        :param str CFFileName: Name of the counts file.
        :param str name: Name of the stream; if it is None, the result
                         of ``sb.stripFName(CFFileName)`` is used.

        :raises NotACountsFormatFileError: If the file contains no
            data, if the first line or the header line is corrupt, or
            if the number of counts in the first data line differs from
            the number of names in the header line.

        """
        CFFile = sb.gz_open(CFFileName)
        try:
            # Set the cf sequence name.
            if name is None:
                name = sb.stripFName(CFFileName)

            # Skip comments.  str.startswith() is used because it is
            # also safe for the empty string returned at the end of the
            # file; indexing with ln[0] raises an IndexError there.
            ln = CFFile.readline()
            while ln.startswith('#'):
                ln = CFFile.readline()
            if ln == '':
                raise NotACountsFormatFileError("File contains no data.")

            # Read in first line.  The length is checked first so that
            # a blank line cannot lead to an IndexError.
            lnL = ln.split()
            if (len(lnL) != 5) or (lnL[0] != "COUNTSFILE"):
                raise NotACountsFormatFileError("First line is corrupt.")
            # TODO: The first line is needed by IQ-Tree, but not by
            # cflib.  Maybe I should use this information here!

            # Skip comments.
            ln = CFFile.readline()
            while ln.startswith('#'):
                ln = CFFile.readline()

            # Read in headerline.
            lnL = ln.split()
            if (len(lnL) < 2 or
                    lnL[0] not in ["CHROM", "Chrom"] or
                    lnL[1] not in ["POS", "Pos"]):
                raise NotACountsFormatFileError("Header line is corrupt.")
            # Everything after the first two columns is a name.
            indivL = [field.strip() for field in lnL[2:]]

            ln = CFFile.readline()
            (chrom, pos, countsL) = interpret_cf_line(ln)
            if len(countsL) != len(indivL):
                raise NotACountsFormatFileError(
                    "Line doesn't fit nr. of species.")
        except Exception:
            # Do not leak the file object if the initialization fails.
            CFFile.close()
            raise

        self.name = name
        self.chrom = chrom
        self.pos = pos
        self.fo = CFFile
        self.indivL = indivL
        self.countsL = countsL
        self.nIndiv = len(countsL)

Beispiel #12

0

Datei anzeigen

Datei: fasta.py Projekt: pomo-dev/cflib

def init_seq(faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize an :class:`FaStream`.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the first sequence.  It returns an
    :class:`FaStream` object. This object can later be used to parse
    the whole fasta file.

    Please close the associated file object with
    :func:`FaStream.close` when you don't need it anymore.  If the
    initialization fails, the file is closed by this function.

    :param str faFileName: File name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the file ends before a header line
        was found or if no header line (starting with ``>``) is found
        within *maxskip* lines.

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # maxskip is an integer and has to be converted before it
            # can be concatenated to the message.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        (nextHL, seq) = read_seq_from_fo(line, faFile)
    except Exception:
        # Do not leak the file object if the initialization fails.
        faFile.close()
        raise
    try:
        nextHL = nextHL.rstrip()
    except AttributeError:
        # NOTE(review): presumably nextHL is None when the file
        # contains a single sequence only -- confirm in read_seq_from_fo.
        pass
    return FaStream(name, seq, nextHL, faFile)

Beispiel #13

0

Datei anzeigen

Datei: fasta.py Projekt: morganmuell/cflib

def init_seq(faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize an :class:`FaStream`.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the first sequence.  It returns an
    :class:`FaStream` object. This object can later be used to parse
    the whole fasta file.

    Please close the associated file object with
    :func:`FaStream.close` when you don't need it anymore.  If the
    initialization fails, the file is closed by this function.

    :param str faFileName: File name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the file ends before a header line
        was found or if no header line (starting with ``>``) is found
        within *maxskip* lines.

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # maxskip is an integer and has to be converted before it
            # can be concatenated to the message.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        (nextHL, seq) = read_seq_from_fo(line, faFile)
    except Exception:
        # Do not leak the file object if the initialization fails.
        faFile.close()
        raise
    try:
        nextHL = nextHL.rstrip()
    except AttributeError:
        # NOTE(review): presumably nextHL is None when the file
        # contains a single sequence only -- confirm in read_seq_from_fo.
        pass
    return FaStream(name, seq, nextHL, faFile)

Beispiel #14

0

Datei anzeigen

Datei: vcf.py Projekt: morganmuell/cflib

def init_seq(VCFFileName, maxskip=100, name=None):
    """Open a (gzipped) VCF4.1 file.

    Try to open the given VCF file, checks if it is in VCF format.
    Initialize a :class:`VCFStream` object that contains the first
    base.

    Please close the associated file object with
    :func:`VCFStream.close` when you don't need it anymore.  If the
    initialization fails, the file is closed by this function.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the file ends before the
        ``#CHROM`` line was found or if this line is not found within
        *maxskip* lines.

    """
    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is None:
            name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                speciesL = get_indiv_from_field_header(line)
                break
        else:
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # The line after the field header is the first data line.
        line = VCFFile.readline()
        base = get_nuc_base_from_line(line, info=False)
        base.set_ploidy()
    except Exception:
        # Do not leak the file object if the initialization fails.
        VCFFile.close()
        raise
    return VCFStream(name, VCFFile, speciesL, base)

Beispiel #15

0

Datei anzeigen

Datei: FilterMSA.py Projekt: morganmuell/cflib

parser.add_argument("msaFile",
                    help="path to (gzipped) multiple sequence alignment file")
parser.add_argument("nSpecies",
                    help="number of aligned species in the given alignment")
parser.add_argument("output", help="name of (gzipped) msa output file")
parser.add_argument('-v',
                    "--verbosity",
                    action="count",
                    help="turn on verbosity")
args = parser.parse_args()

mfaFN = args.msaFile
nSpecies = int(args.nSpecies)
output = args.output
vb = args.verbosity

mfa = fa.MFaStream(mfaFN)
# The try/finally blocks make sure that the input stream and the output
# file are closed even if an error is raised while filtering.
try:
    fp = fa.MFaStrFilterProps(nSpecies)
    oF = sb.gz_open(output, mode='w')
    try:
        while True:
            # Only write alignments for which the filter returns True.
            if fa.filter_mfa_str(mfa, fp, vb) is True:
                mfa.print_msa(fo=oF)
            # Stop as soon as there is no further alignment.
            if mfa.read_next_align() is None:
                break
    finally:
        oF.close()
finally:
    mfa.close()

Beispiel #16

0

Datei anzeigen

Datei: fasta.py Projekt: morganmuell/cflib

def save_as_vcf(faSeq, ref, VCFFileName):
    """Save the given :class:`FaSeq` in VCF format.

    In general, we want to convert a fasta file with various
    individuals with the help of a reference that contains one
    sequence to a VCF file that contains all the SNPs.  This can be
    done with this function.  Until now it is not possible to do this
    conversion for several chromosomes for each individual in one run.
    Still, the conversion can be done chromosome by chromosome.

    This function saves the SNPs of *faSeq*, a given :class:`FaSeq`
    (fasta sequence) object in VCF format to the file *VCFFileName*.
    The reference genome *ref*, to which *faSeq* is compared to, needs
    to be passed as a :class:`Seq <cflib.seqbase.Seq>` object.

    The function compares all sequences in *faSeq* to the sequence
    given in *ref*.  The names of the individuals in the saved VCF
    file will be the sequence names of the *faSeq* object.  Only
    positions at which at least one sequence differs from the
    reference are written.

    ::

      #CHROM = sequence name of the reference
      POS    = position relative to reference
      ID     = .
      REF    = base of reference
      ALT    = SNP (e.g. 'C' or 'G,T' if 2 different SNPs are present)
      QUAL   = .
      FILTER = .
      INFO   = .
      FORMAT = GT

    :param FaSeq faSeq: :class:`FaSeq` object to be converted.
    :param Seq ref: :class:`Seq <cflib.seqbase.Seq>` object of the
                    reference sequence.
    :param str VCFFileName: Name of the VCF output file.

    :raises sb.SequenceDataError: If *faSeq* is not an :class:`FaSeq`,
        if *ref* is not a :class:`Seq <cflib.seqbase.Seq>`, if *faSeq*
        has no sequences or if a sequence and the reference differ in
        length.

    """
    def get_altBases_string(sAltBases):
        """Return ALT bases string from given `sAltBases`."""
        return ','.join(map(str, sAltBases))

    def get_indiv_string(indivData, altBases, sAltBases):
        """Return the string of the individual data.

        Return the string extracted from the individual data
        `indivData` with SNPs `altBases`. `sAltBases` is the sorted
        list of the alternative bases.

        E.g.:
        REF = A
        ALT = C,G
        individual i1 has A
        individual i2 has C
        individual i3 has G

        Then the string should look like:
        '0\t1\t2'
        -> 0 for REF, 1 for first ALT and 2 for second ALT

        """
        return '\t'.join(
            str(sAltBases.index(base) + 1) if base in altBases else '0'
            for base in indivData)

    def get_vcf_line(chromName, pos, refBase, altBaseString, indivString):
        """Return a VCF file line built from the given data."""
        return '\t'.join([
            chromName,
            str(pos),
            '.',  # id
            refBase,
            altBaseString,
            '.',  # qual
            '.',  # filter
            '.',  # info
            "GT",  # format
            indivString,
        ])

    if (not isinstance(faSeq, FaSeq)):
        raise sb.SequenceDataError("`faSeq` is not an FaSeq object.")
    if (not isinstance(ref, sb.Seq)):
        raise sb.SequenceDataError("`ref` is not a Seq object.")
    if faSeq.nSpecies == 0:
        raise sb.SequenceDataError("`faSeq` has no saved sequences.")
    seqs = [faSeq.seqL[s] for s in range(faSeq.nSpecies)]
    for seq in seqs:
        if seq.dataLen != ref.dataLen:
            raise sb.SequenceDataError("Sequence " + seq.name +
                                       " has different length than reference.")
    VCFFile = sb.gz_open(VCFFileName, mode='w')
    # Make sure that the output file is closed if writing fails.
    try:
        print(vcf.get_header_line_string(faSeq.get_seq_names()),
              file=VCFFile)
        # loop over bases
        for i in range(ref.dataLen):
            refBase = ref.data[i]
            # Collect the base of every sequence at this position and
            # check if there is a SNP.
            indivData = [seq.data[i] for seq in seqs]
            altBases = {base for base in indivData if base != refBase}
            if not altBases:
                # No SNP at this position, nothing has to be written.
                continue
            sAltBases = sorted(altBases)
            altBaseString = get_altBases_string(sAltBases)
            indivString = get_indiv_string(indivData, altBases, sAltBases)
            # VCF positions are 1-based.
            print(get_vcf_line(ref.name, i + 1, refBase, altBaseString,
                               indivString),
                  file=VCFFile)
    finally:
        VCFFile.close()
    return

Beispiel #17

0

Datei anzeigen

Datei: fasta.py Projekt: pomo-dev/cflib

def save_as_vcf(faSeq, ref, VCFFileName):
    """Save the given :class:`FaSeq` in VCF format.

    In general, we want to convert a fasta file with various
    individuals with the help of a reference that contains one
    sequence to a VCF file that contains all the SNPs.  This can be
    done with this function.  Until now it is not possible to do this
    conversion for several chromosomes for each individual in one run.
    Still, the conversion can be done chromosome by chromosome.

    This function saves the SNPs of *faSeq*, a given :class:`FaSeq`
    (fasta sequence) object in VCF format to the file *VCFFileName*.
    The reference genome *ref*, to which *faSeq* is compared to, needs
    to be passed as a :class:`Seq <cflib.seqbase.Seq>` object.

    The function compares all sequences in *faSeq* to the sequence
    given in *ref*.  The names of the individuals in the saved VCF
    file will be the sequence names of the *faSeq* object.  Only
    positions at which at least one sequence differs from the
    reference are written.

    ::

      #CHROM = sequence name of the reference
      POS    = position relative to reference
      ID     = .
      REF    = base of reference
      ALT    = SNP (e.g. 'C' or 'G,T' if 2 different SNPs are present)
      QUAL   = .
      FILTER = .
      INFO   = .
      FORMAT = GT

    :param FaSeq faSeq: :class:`FaSeq` object to be converted.
    :param Seq ref: :class:`Seq <cflib.seqbase.Seq>` object of the
                    reference sequence.
    :param str VCFFileName: Name of the VCF output file.

    :raises sb.SequenceDataError: If *faSeq* is not an :class:`FaSeq`,
        if *ref* is not a :class:`Seq <cflib.seqbase.Seq>`, if *faSeq*
        has no sequences or if a sequence and the reference differ in
        length.

    """
    def get_altBases_string(sAltBases):
        """Return ALT bases string from given `sAltBases`."""
        return ','.join(map(str, sAltBases))

    def get_indiv_string(indivData, altBases, sAltBases):
        """Return the string of the individual data.

        Return the string extracted from the individual data
        `indivData` with SNPs `altBases`. `sAltBases` is the sorted
        list of the alternative bases.

        E.g.:
        REF = A
        ALT = C,G
        individual i1 has A
        individual i2 has C
        individual i3 has G

        Then the string should look like:
        '0\t1\t2'
        -> 0 for REF, 1 for first ALT and 2 for second ALT

        """
        return '\t'.join(
            str(sAltBases.index(base) + 1) if base in altBases else '0'
            for base in indivData)

    def get_vcf_line(chromName, pos,
                     refBase, altBaseString, indivString):
        """Return a VCF file line built from the given data."""
        return '\t'.join([
            chromName,
            str(pos),
            '.',    # id
            refBase,
            altBaseString,
            '.',    # qual
            '.',    # filter
            '.',    # info
            "GT",   # format
            indivString,
        ])

    if (not isinstance(faSeq, FaSeq)):
        raise sb.SequenceDataError("`faSeq` is not an FaSeq object.")
    if (not isinstance(ref, sb.Seq)):
        raise sb.SequenceDataError("`ref` is not a Seq object.")
    if faSeq.nSpecies == 0:
        raise sb.SequenceDataError("`faSeq` has no saved sequences.")
    seqs = [faSeq.seqL[s] for s in range(faSeq.nSpecies)]
    for seq in seqs:
        if seq.dataLen != ref.dataLen:
            raise sb.SequenceDataError(
                "Sequence " + seq.name +
                " has different length than reference.")
    VCFFile = sb.gz_open(VCFFileName, mode='w')
    # Make sure that the output file is closed if writing fails.
    try:
        print(vcf.get_header_line_string(faSeq.get_seq_names()),
              file=VCFFile)
        # loop over bases
        for i in range(ref.dataLen):
            refBase = ref.data[i]
            # Collect the base of every sequence at this position and
            # check if there is a SNP.
            indivData = [seq.data[i] for seq in seqs]
            altBases = {base for base in indivData if base != refBase}
            if not altBases:
                # No SNP at this position, nothing has to be written.
                continue
            sAltBases = sorted(altBases)
            altBaseString = get_altBases_string(sAltBases)
            indivString = get_indiv_string(indivData, altBases, sAltBases)
            # VCF positions are 1-based.
            print(
                get_vcf_line(ref.name, i+1, refBase,
                             altBaseString, indivString),
                file=VCFFile)
    finally:
        VCFFile.close()
    return