def close(self):
    """Finalize the output file and close all file objects.

    Close every file object in *self.vcfTfL* and the output file
    object *self.outFO*.  Then prepend the header line (file type
    specifier, number of populations *self.nPop* and number of sites
    *self.baseCounter*) to the output file *self.outFN*.

    A line cannot be inserted at the beginning of an existing file,
    so the content is copied into a temporary file (``temp_`` + base
    name, in the same directory) that starts with the header line.
    The temporary file then replaces the original output file.

    """
    for tf in self.vcfTfL:
        tf.close()
    self.outFO.close()

    # TODO: The whole file needs to be copied, maybe there is a
    # better method?
    temp_fn = "temp_" + os.path.basename(self.outFN)
    temp_fd = os.path.dirname(self.outFN)
    temp_path = os.path.join(temp_fd, temp_fn)

    fo = sb.gz_open(temp_path, mode='w')
    try:
        print("COUNTSFILE NPOP", self.nPop,
              "NSITES", self.baseCounter, file=fo)
        with sb.gz_open(self.outFN, mode='r') as f:
            for ln in f:
                print(ln, file=fo, end='')
    finally:
        # Release the handle even if copying fails half-way.
        fo.close()

    # os.replace overwrites an existing target on every platform;
    # os.rename fails on Windows if the target already exists.
    os.replace(temp_path, self.outFN)
def __init__(self, faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize :class:`MFaStream`.

    The first alignment is read with :func:`read_align_from_fo`; the
    file object stays open and is stored in *self.fo*.

    :param str faFileName: File name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of
      the first sequence (defaults to 50).
    :param str name: Set the name of the stream to *name*, otherwise
      set it to the stripped filename.

    :raises NotAFastaFileError: if the file is empty or no header
      line (starting with ``>``) is found within *maxskip* lines.

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # Loop ended without `break`: no header was found.
            raise NotAFastaFileError(
                "Didn't find a species header within "
                + str(maxskip) + " lines.")
        (nextHL, seqL) = read_align_from_fo(line, faFile)
    except Exception:
        # Do not leak the file handle if initialization fails.
        faFile.close()
        raise

    try:
        nextHL = nextHL.rstrip()
    except AttributeError:
        # presumably `nextHL` is None when no further header line
        # follows (end of file) -- keep it unchanged.
        pass

    # Add state objects.
    self.name = name
    self.seqL = seqL
    self.nSpecies = len(self.seqL)
    self.nextHeaderLine = nextHL
    self.fo = faFile
def open_seq(VCFFileName, maxskip=100, name=None):
    """Open a VCF4.1 file.

    Try to open the given VCF file, check if it is in VCF format and
    read the base(s).  Return a :class:`VCFSeq` object that contains
    all the information.  The file is closed before returning, also
    when an error occurs.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
      bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
      set it to the filename.

    :raises NotAVariantCallFormatFileError: if the file is empty or no
      ``#CHROM`` header line is found within *maxskip* lines.

    """
    def test_sequence(seq):
        """Test a given VCF sequence.

        TODO: implement this.

        """
        pass

    seq = VCFSeq()
    seq.header = ""

    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is not None:
            seq.name = name
        else:
            seq.name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line[0:2] == '##':
                # Meta-information lines are collected in the header.
                seq.header += line
            if line[0:6] == '#CHROM':
                # Here starts the data.
                check_fixed_field_header(line)
                seq.speciesL = get_indiv_from_field_header(line)
                seq.nSpecies = len(seq.speciesL)
                break
        else:
            # Loop ended without `break`: no field header was found.
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # Every remaining line is treated as one base.
        for line in VCFFile:
            base = get_nuc_base_from_line(line)
            seq.append_nuc_base(base)
    finally:
        VCFFile.close()

    test_sequence(seq)
    return seq
def __init_outFO(self):
    """Create the output file object *self.outFO*.

    The file *self.outFN* is opened for writing.  If the file name
    ends with "gz", the outfile will be compressed and is opened with
    gzip.open().

    """
    out_path = self.outFN
    self.outFO = sb.gz_open(out_path, mode='w')
def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the sequence(s).  It returns an
    :class:`FaSeq` object that contains a list of species names, a
    list of the respective descriptions and a list with the sequences.
    The file is closed before returning, also when an error occurs.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
      sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
      set it to the stripped filename.

    :raises NotAFastaFileError: if the file is empty or no header line
      (starting with ``>``) is found within *maxskip* lines.
    :raises sb.SequenceDataError: if a sequence name or its data is
      missing, or if the sequence names are not unique.

    """
    def test_sequence(faSequence):
        """Test if sequences contain data and have unique names."""
        nSeqs = faSequence.nSpecies
        names = []
        for seq in faSequence.seqL[:nSeqs]:
            names.append(seq.name)
            if seq.name == '' or seq.data == '':
                raise sb.SequenceDataError(
                    "Sequence name or data is missing.")
        if nSeqs > len(set(names)):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()

    faFile = sb.gz_open(faFileName)
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # Loop ended without `break`: no header was found.
            raise NotAFastaFileError(
                "Didn't find a species header within "
                + str(maxskip) + " lines.")
        # `read_seq_from_fo` hands back the next header line; the loop
        # stops as soon as it returns None instead.
        while line is not None:
            (line, seq) = read_seq_from_fo(line, faFile)
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        faFile.close()

    test_sequence(fastaSeq)
    # Index the sequences by name.
    for s in fastaSeq.seqL:
        fastaSeq.seqD[s.name] = s
    return fastaSeq
def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the sequence(s).  It returns an
    :class:`FaSeq` object that contains a list of species names, a
    list of the respective descriptions and a list with the sequences.
    The file is closed before returning, also when an error occurs.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
      sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
      set it to the stripped filename.

    :raises NotAFastaFileError: if the file is empty or no header line
      (starting with ``>``) is found within *maxskip* lines.
    :raises sb.SequenceDataError: if a sequence name or its data is
      missing, or if the sequence names are not unique.

    """
    def test_sequence(faSequence):
        """Test if sequences contain data and have unique names."""
        length = faSequence.nSpecies
        names = []
        for seq in faSequence.seqL[:length]:
            names.append(seq.name)
            if seq.name == '' or seq.data == '':
                raise sb.SequenceDataError(
                    "Sequence name or data is missing.")
        if length > len(set(names)):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()

    faFile = sb.gz_open(faFileName)
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # Loop ended without `break`: no header was found.
            raise NotAFastaFileError(
                "Didn't find a species header within "
                + str(maxskip) + " lines.")
        # `read_seq_from_fo` hands back the next header line; the loop
        # stops as soon as it returns None instead.
        while line is not None:
            (line, seq) = read_seq_from_fo(line, faFile)
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        faFile.close()

    test_sequence(fastaSeq)
    # Index the sequences by name.
    for s in fastaSeq.seqL:
        fastaSeq.seqD[s.name] = s
    return fastaSeq
def __init__(self, CFFileName, name=None):
    """Open a counts file and read up to the first data line.

    The ``COUNTSFILE`` line and the header line (``CHROM POS`` followed
    by the names of the individuals) are validated; lines starting
    with ``#`` in front of them are skipped.  The first data line is
    parsed with :func:`interpret_cf_line`.  The file object stays open
    and is stored in *self.fo*.

    :param str CFFileName: Name of the counts file.
    :param str name: Set the name of the stream to *name*, otherwise
      set it to the stripped filename.

    :raises NotACountsFormatFileError: if the file is empty, if the
      first line or the header line is corrupt or missing, or if the
      first data line does not fit the number of individuals.

    """
    CFFile = sb.gz_open(CFFileName)
    try:
        # Set the cf sequence name.
        if name is None:
            name = sb.stripFName(CFFileName)

        # Find the start of the first base.
        ln = CFFile.readline()
        if ln == '':
            raise NotACountsFormatFileError("File contains no data.")
        # Skip comments.  `startswith` is safe at end of file, where
        # `readline` returns the empty string.
        while ln.startswith('#'):
            ln = CFFile.readline()

        # Read in first line.  The length is checked first so that an
        # empty or blank line is reported as corrupt.
        lnL = ln.split()
        if (len(lnL) != 5) or (lnL[0] != "COUNTSFILE"):
            raise NotACountsFormatFileError("First line is corrupt.")
        # TODO: The first line is needed by IQ-Tree, but not by
        # cflib.  Maybe I should use this information here!

        ln = CFFile.readline()
        # Skip comments.
        while ln.startswith('#'):
            ln = CFFile.readline()

        # Read in headerline.
        lnL = ln.split()
        if (len(lnL) < 2
                or lnL[0] not in ["CHROM", "Chrom"]
                or lnL[1] not in ["POS", "Pos"]):
            raise NotACountsFormatFileError("Header line is corrupt.")
        # All columns after CHROM and POS name the individuals.
        indivL = lnL[2:]

        ln = CFFile.readline()
        (chrom, pos, countsL) = interpret_cf_line(ln)
        if len(countsL) != len(indivL):
            raise NotACountsFormatFileError(
                "Line doesn't fit nr. of species.")
    except Exception:
        # Do not leak the file handle if initialization fails.
        CFFile.close()
        raise

    self.name = name
    self.chrom = chrom
    self.pos = pos
    self.fo = CFFile
    self.indivL = indivL
    self.countsL = countsL
    self.nIndiv = len(countsL)
def __init__(self, CFFileName, name=None):
    """Open a counts file and read up to the first data line.

    The ``COUNTSFILE`` line and the header line (``CHROM POS`` followed
    by the names of the individuals) are validated; lines starting
    with ``#`` in front of them are skipped.  The first data line is
    parsed with :func:`interpret_cf_line`.  The file object stays open
    and is stored in *self.fo*.

    :param str CFFileName: Name of the counts file.
    :param str name: Set the name of the stream to *name*, otherwise
      set it to the stripped filename.

    :raises NotACountsFormatFileError: if the file is empty, if the
      first line or the header line is corrupt or missing, or if the
      first data line does not fit the number of individuals.

    """
    CFFile = sb.gz_open(CFFileName)
    try:
        # Set the cf sequence name.
        if name is None:
            name = sb.stripFName(CFFileName)

        # Find the start of the first base.
        ln = CFFile.readline()
        if ln == '':
            raise NotACountsFormatFileError("File contains no data.")
        # Skip comments.  `startswith` is safe at end of file, where
        # `readline` returns the empty string.
        while ln.startswith('#'):
            ln = CFFile.readline()

        # Read in first line.  The length is checked first so that an
        # empty or blank line is reported as corrupt.
        firstL = ln.split()
        if (len(firstL) != 5) or (firstL[0] != "COUNTSFILE"):
            raise NotACountsFormatFileError("First line is corrupt.")
        # TODO: The first line is needed by IQ-Tree, but not by
        # cflib.  Maybe I should use this information here!

        ln = CFFile.readline()
        # Skip comments.
        while ln.startswith('#'):
            ln = CFFile.readline()

        # Read in headerline.
        headerL = ln.split()
        if (len(headerL) < 2
                or headerL[0] not in ["CHROM", "Chrom"]
                or headerL[1] not in ["POS", "Pos"]):
            raise NotACountsFormatFileError("Header line is corrupt.")
        # All columns after CHROM and POS name the individuals.
        indivL = headerL[2:]

        ln = CFFile.readline()
        (chrom, pos, countsL) = interpret_cf_line(ln)
        if len(countsL) != len(indivL):
            raise NotACountsFormatFileError(
                "Line doesn't fit nr. of species.")
    except Exception:
        # Do not leak the file handle if initialization fails.
        CFFile.close()
        raise

    self.name = name
    self.chrom = chrom
    self.pos = pos
    self.fo = CFFile
    self.indivL = indivL
    self.countsL = countsL
    self.nIndiv = len(countsL)
def init_seq(faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize an :class:`FaStream`.

    This function tries to open the given fasta file, checks if it is
    in fasta format and reads the first sequence.  It returns an
    :class:`FaStream` object.  This object can later be used to parse
    the whole fasta file.

    Please close the associated file object with
    :func:`FaStream.close` when you don't need it anymore.  If
    initialization fails, the file is closed here.

    :param str faFileName: File name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
      sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name*, otherwise
      set it to the stripped filename.

    :raises NotAFastaFileError: if the file is empty or no header line
      (starting with ``>``) is found within *maxskip* lines.

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # species name found in line
                break
        else:
            # Loop ended without `break`: no header was found.
            raise NotAFastaFileError(
                "Didn't find a species header within "
                + str(maxskip) + " lines.")
        (nextHL, seq) = read_seq_from_fo(line, faFile)
        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # presumably `nextHL` is None when no further header line
            # follows (end of file) -- keep it unchanged.
            pass
        return FaStream(name, seq, nextHL, faFile)
    except Exception:
        # Do not leak the file handle if initialization fails.
        faFile.close()
        raise
def init_seq(VCFFileName, maxskip=100, name=None):
    """Open a (gzipped) VCF4.1 file.

    Try to open the given VCF file, check if it is in VCF format.
    Initialize a :class:`VCFStream` object that contains the first
    base.

    Please close the associated file object with
    :func:`VCFStream.close` when you don't need it anymore.  If
    initialization fails, the file is closed here.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
      bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
      set it to the filename.

    :raises NotAVariantCallFormatFileError: if the file is empty or no
      ``#CHROM`` header line is found within *maxskip* lines.

    """
    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is None:
            name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                raise NotAVariantCallFormatFileError(
                    "File contains no data.")
            if line[0:6] == '#CHROM':
                # Here starts the data.
                check_fixed_field_header(line)
                speciesL = get_indiv_from_field_header(line)
                break
        else:
            # Loop ended without `break`: no field header was found.
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # The line after the field header holds the first base.
        line = VCFFile.readline()
        base = get_nuc_base_from_line(line, info=False)
        base.set_ploidy()
        return VCFStream(name, VCFFile, speciesL, base)
    except Exception:
        # Do not leak the file handle if initialization fails.
        VCFFile.close()
        raise
# Command line interface: input alignment, number of species, output.
parser.add_argument(
    "msaFile",
    help="path to (gzipped) multiple sequence alignment file")
parser.add_argument(
    "nSpecies",
    help="number of aligned species in the given alignment")
parser.add_argument(
    "output",
    help="name of (gzipped) msa output file")
parser.add_argument(
    '-v', "--verbosity",
    action="count",
    help="turn on verbosity")

args = parser.parse_args()

mfaFN = args.msaFile
nSpecies = int(args.nSpecies)
output = args.output
vb = args.verbosity

# Open the alignment stream, set up the filter and open the output.
mfa = fa.MFaStream(mfaFN)
fp = fa.MFaStrFilterProps(nSpecies)
oF = sb.gz_open(output, mode='w')

# Walk through all alignments; print those that pass the filter.  The
# first alignment is already loaded, so it is checked before advancing.
moreAlignments = True
while moreAlignments:
    if fa.filter_mfa_str(mfa, fp, vb) is True:
        mfa.print_msa(fo=oF)
    moreAlignments = mfa.read_next_align() is not None

oF.close()
mfa.close()
def save_as_vcf(faSeq, ref, VCFFileName):
    """Save the given :class:`FaSeq` in VCF format.

    In general, we want to convert a fasta file with various
    individuals with the help of a reference that contains one
    sequence to a VCF file that contains all the SNPs.  This can be
    done with this function.  Until now it is not possible to do this
    conversion for several chromosomes for each individual in one
    run.  Still, the conversion can be done chromosome by chromosome.

    This function saves the SNPs of *faSeq*, a given :class:`FaSeq`
    (fasta sequence) object in VCF format to the file *VCFFileName*.
    The reference genome *ref*, to which *faSeq* is compared to, needs
    to be passed as a :class:`Seq <cflib.seqbase.Seq>` object.

    The function compares all sequences in *faSeq* to the sequence
    given in *ref*.  The names of the individuals in the saved VCF
    file will be the sequence names of the *faSeq* object.  Only
    positions where at least one individual differs from the reference
    are written.

    ::

      #CHROM = sequence name of the reference
      POS    = position relative to reference
      ID     = .
      REF    = base of reference
      ALT    = SNP (e.g. 'C' or 'G,T' if 2 different SNPs are present)
      QUAL   = .
      FILTER = .
      INFO   = .
      FORMAT = GT

    The genotype of each individual is 0 for REF, 1 for the first ALT
    base, 2 for the second ALT base and so on.  E.g., with REF = A and
    ALT = C,G, the individuals with bases A, C and G get the genotype
    columns ``0\\t1\\t2``.

    :param FaSeq faSeq: :class:`FaSeq` object to be converted.
    :param Seq ref: :class:`Seq <cflib.seqbase.Seq>` object of the
      reference sequence.
    :param str VCFFileName: Name of the VCF output file.

    :raises sb.SequenceDataError: if *faSeq* is not an :class:`FaSeq`,
      if *ref* is not a :class:`Seq <cflib.seqbase.Seq>`, if *faSeq*
      has no sequences or if a sequence differs in length from the
      reference.

    """
    if not isinstance(faSeq, FaSeq):
        raise sb.SequenceDataError("`faSeq` is not an FaSeq object.")
    if not isinstance(ref, sb.Seq):
        raise sb.SequenceDataError("`ref` is not a Seq object.")
    if faSeq.nSpecies == 0:
        raise sb.SequenceDataError("`faSeq` has no saved sequences.")
    seqs = faSeq.seqL[:faSeq.nSpecies]
    for seq in seqs:
        if seq.dataLen != ref.dataLen:
            raise sb.SequenceDataError(
                "Sequence " + seq.name +
                " has different length than reference.")

    VCFFile = sb.gz_open(VCFFileName, mode='w')
    try:
        print(vcf.get_header_line_string(faSeq.get_seq_names()),
              file=VCFFile)
        # Loop over bases.
        for i in range(ref.dataLen):
            refBase = ref.data[i]
            indivData = [seq.data[i] for seq in seqs]
            # Sorted so that the ALT order (and thereby the genotype
            # numbering) is deterministic.
            sAltBases = sorted(set(indivData) - {refBase})
            if not sAltBases:
                # No SNP at this position; nothing to write.
                continue
            # Genotype number of each ALT base; the reference is 0.
            altIndex = {b: n for n, b in enumerate(sAltBases, start=1)}
            fields = [
                ref.name,
                str(i + 1),           # pos (1-based)
                '.',                  # id
                refBase,
                ','.join(sAltBases),  # alt
                '.',                  # qual
                '.',                  # filter
                '.',                  # info
                "GT",                 # format
            ]
            fields.extend(str(altIndex.get(b, 0)) for b in indivData)
            print('\t'.join(fields), file=VCFFile)
    finally:
        # Close the output even if writing fails half-way.
        VCFFile.close()
    return
def save_as_vcf(faSeq, ref, VCFFileName):
    """Save the given :class:`FaSeq` in VCF format.

    In general, we want to convert a fasta file with various
    individuals with the help of a reference that contains one
    sequence to a VCF file that contains all the SNPs.  This can be
    done with this function.  Until now it is not possible to do this
    conversion for several chromosomes for each individual in one
    run.  Still, the conversion can be done chromosome by chromosome.

    This function saves the SNPs of *faSeq*, a given :class:`FaSeq`
    (fasta sequence) object in VCF format to the file *VCFFileName*.
    The reference genome *ref*, to which *faSeq* is compared to, needs
    to be passed as a :class:`Seq <cflib.seqbase.Seq>` object.

    The function compares all sequences in *faSeq* to the sequence
    given in *ref*.  The names of the individuals in the saved VCF
    file will be the sequence names of the *faSeq* object.  Only
    positions where at least one individual differs from the reference
    are written.

    ::

      #CHROM = sequence name of the reference
      POS    = position relative to reference
      ID     = .
      REF    = base of reference
      ALT    = SNP (e.g. 'C' or 'G,T' if 2 different SNPs are present)
      QUAL   = .
      FILTER = .
      INFO   = .
      FORMAT = GT

    The genotype of each individual is 0 for REF, 1 for the first ALT
    base, 2 for the second ALT base and so on.  E.g., with REF = A and
    ALT = C,G, the individuals with bases A, C and G get the genotype
    columns ``0\\t1\\t2``.

    :param FaSeq faSeq: :class:`FaSeq` object to be converted.
    :param Seq ref: :class:`Seq <cflib.seqbase.Seq>` object of the
      reference sequence.
    :param str VCFFileName: Name of the VCF output file.

    :raises sb.SequenceDataError: if *faSeq* is not an :class:`FaSeq`,
      if *ref* is not a :class:`Seq <cflib.seqbase.Seq>`, if *faSeq*
      has no sequences or if a sequence differs in length from the
      reference.

    """
    if not isinstance(faSeq, FaSeq):
        raise sb.SequenceDataError("`faSeq` is not an FaSeq object.")
    if not isinstance(ref, sb.Seq):
        raise sb.SequenceDataError("`ref` is not a Seq object.")
    if faSeq.nSpecies == 0:
        raise sb.SequenceDataError("`faSeq` has no saved sequences.")
    seqs = faSeq.seqL[:faSeq.nSpecies]
    for seq in seqs:
        if seq.dataLen != ref.dataLen:
            raise sb.SequenceDataError(
                "Sequence " + seq.name +
                " has different length than reference.")

    VCFFile = sb.gz_open(VCFFileName, mode='w')
    try:
        print(vcf.get_header_line_string(faSeq.get_seq_names()),
              file=VCFFile)
        # Loop over bases.
        for i in range(ref.dataLen):
            refBase = ref.data[i]
            indivData = [seq.data[i] for seq in seqs]
            # Sorted so that the ALT order (and thereby the genotype
            # numbering) is deterministic.
            sAltBases = sorted(set(indivData) - {refBase})
            if not sAltBases:
                # No SNP at this position; nothing to write.
                continue
            # Genotype number of each ALT base; the reference is 0.
            altIndex = {b: n for n, b in enumerate(sAltBases, start=1)}
            fields = [
                ref.name,
                str(i + 1),           # pos (1-based)
                '.',                  # id
                refBase,
                ','.join(sAltBases),  # alt
                '.',                  # qual
                '.',                  # filter
                '.',                  # info
                "GT",                 # format
            ]
            fields.extend(str(altIndex.get(b, 0)) for b in indivData)
            print('\t'.join(fields), file=VCFFile)
    finally:
        # Close the output even if writing fails half-way.
        VCFFile.close()
    return