def __init__(self, faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize :class:`MFaStream`.

    The file is opened with ``sb.gz_open`` and scanned for the first
    header line (a line starting with ``>``).  Starting from this line,
    ``read_align_from_fo`` is called once and its results are stored on
    the instance together with the still open file object.

    Sets the attributes ``name``, ``seqL``, ``nSpecies`` (the length of
    ``seqL``), ``nextHeaderLine`` and ``fo``.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Maximum number of lines that are read while
      looking for the first header line (defaults to 50).
    :param str name: Name of the stream; if it is None, the name is set
      to ``sb.stripFName(faFileName)``.

    :raises NotAFastaFileError: If the end of the file is reached before
      a header line is found, or if no header line is found within
      *maxskip* lines.

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line.startswith('>'):
                # Species name found in line.
                break
        else:
            # `maxskip` is an int; it has to be converted explicitly,
            # otherwise building the message raises a TypeError.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        (nextHL, seqL) = read_align_from_fo(line, faFile)
        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # `nextHL` is not a string (presumably None when there is no
            # further header line); keep it unchanged.
            pass
    except Exception:
        # Do not leak the file handle if the initialization fails.
        faFile.close()
        raise

    # Add state objects.
    self.name = name
    self.seqL = seqL
    self.nSpecies = len(self.seqL)
    self.nextHeaderLine = nextHL
    self.fo = faFile
def open_seq(VCFFileName, maxskip=100, name=None):
    """Open a VCF4.1 file.

    Try to open the given VCF file, checks if it is in VCF format and
    reads the bases(s).  It returns an :class:`VCFSeq` object that
    contains all the information.

    Lines starting with ``##`` are collected in the ``header`` attribute
    of the returned object.  The line starting with ``#CHROM`` marks the
    start of the data; every line after it is converted with
    ``get_nuc_base_from_line`` and appended to the sequence.  The file is
    closed before this function returns, also if an error occurs.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the end of the file is
      reached before the ``#CHROM`` line is found, or if this line is not
      found within *maxskip* lines.

    """
    def test_sequence(seq):
        """Test a given VCF sequence.

        TODO: implement this.

        """
        pass

    seq = VCFSeq()
    seq.header = ""

    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is not None:
            seq.name = name
        else:
            seq.name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                raise NotAVariantCallFormatFileError("File contains no data.")
            if line.startswith('##'):
                seq.header += line
            if line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                seq.speciesL = get_indiv_from_field_header(line)
                seq.nSpecies = len(seq.speciesL)
                break
        else:
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        for line in VCFFile:
            base = get_nuc_base_from_line(line)
            seq.append_nuc_base(base)
    finally:
        # Close the file on success and on every error path.
        VCFFile.close()

    test_sequence(seq)
    return seq
def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    This function tries to open the given fasta file, checks if it is in
    fasta format and reads the sequence(s).  It returns an :class:`FaSeq`
    object that contains a list of species names, a list of the
    respective descriptions and a list with the sequences.

    The file is closed before this function returns, also if an error
    occurs.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the end of the file is reached before
      a header line is found, or if no header line is found within
      *maxskip* lines.
    :raises sb.SequenceDataError: If a sequence name or the data of a
      sequence is empty, or if the sequence names are not unique.

    """
    def test_sequence(faSequence):
        """Tests if sequences contain data and have unique names."""
        length = faSequence.nSpecies
        names = []
        for i in range(length):
            entry = faSequence.seqL[i]
            names.append(entry.name)
            if entry.name == '' or entry.data == '':
                raise sb.SequenceDataError("Sequence name or data is missing.")
        if length > len(set(names)):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()

    faFile = sb.gz_open(faFileName)
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line.startswith('>'):
                # Species name found in line.
                break
        else:
            # `maxskip` is an int; it has to be converted explicitly,
            # otherwise building the message raises a TypeError.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        # `read_seq_from_fo` hands back the line that starts the next
        # sequence; the loop stops as soon as this is None.
        while line is not None:
            (line, seq) = read_seq_from_fo(line, faFile)
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        # Close the file on success and on every error path.
        faFile.close()

    test_sequence(fastaSeq)
    # Make the sequences accessible by name.
    for s in fastaSeq.seqL:
        fastaSeq.seqD[s.name] = s
    return fastaSeq
def open_seq(faFileName, maxskip=50, name=None):
    """Open and read a fasta file.

    This function tries to open the given fasta file, checks if it is in
    fasta format and reads the sequence(s).  It returns an :class:`FaSeq`
    object that contains a list of species names, a list of the
    respective descriptions and a list with the sequences.

    The file is always closed before this function returns or raises.

    :param str faFileName: Name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name* otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the end of the file is reached before
      a header line is found, or if no header line is found within
      *maxskip* lines.
    :raises sb.SequenceDataError: If a sequence name or the data of a
      sequence is empty, or if the sequence names are not unique.

    """
    def test_sequence(faSequence):
        """Tests if sequences contain data and if the names are unique."""
        nSeqs = faSequence.nSpecies
        names = []
        for i in range(nSeqs):
            names.append(faSequence.seqL[i].name)
            if faSequence.seqL[i].name == '' or faSequence.seqL[i].data == '':
                raise sb.SequenceDataError("Sequence name or data is missing.")
        if nSeqs > len(set(names)):
            raise sb.SequenceDataError("Sequence names are not unique.")

    fastaSeq = FaSeq()
    foundHeader = False

    faFile = sb.gz_open(faFileName)
    try:
        if name is not None:
            fastaSeq.name = name
        else:
            fastaSeq.name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line[0] == '>':
                # Species name found in line.
                foundHeader = True
                break
        if not foundHeader:
            # Convert the integer explicitly; concatenating it directly
            # to the message raises a TypeError.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        # Read sequence after sequence until `read_seq_from_fo` reports
        # no further starting line (None).
        while line is not None:
            (nextLine, seq) = read_seq_from_fo(line, faFile)
            line = nextLine
            fastaSeq.seqL.append(seq)
            fastaSeq.nSpecies += 1
    finally:
        # Never leak the file handle, not even when an error is raised.
        faFile.close()

    test_sequence(fastaSeq)
    # Fill the dictionary that maps the sequence names to the sequences.
    for s in fastaSeq.seqL:
        fastaSeq.seqD[s.name] = s
    return fastaSeq
def __init__(self, CFFileName, name=None):
    """Open a counts file and read everything up to the first data line.

    The file is opened with ``sb.gz_open``.  Comment lines (starting with
    ``#``) are skipped, then the ``COUNTSFILE`` line and the header line
    (``CHROM POS <individuals...>``) are checked.  The first data line is
    interpreted with ``interpret_cf_line``.  The file object stays open
    and is stored in ``self.fo``; it is closed if the initialization
    fails.

    Sets the attributes ``name``, ``chrom``, ``pos``, ``fo``, ``indivL``,
    ``countsL`` and ``nIndiv``.

    :param str CFFileName: Name of the counts file.
    :param str name: Name of the stream; if it is None, the name is set
      to ``sb.stripFName(CFFileName)``.

    :raises NotACountsFormatFileError: If the file contains no data, if
      the first line or the header line is corrupt or missing, or if the
      number of counts of the first data line does not match the number
      of individuals in the header line.

    """
    CFFile = sb.gz_open(CFFileName)
    try:
        # Set the cf sequence name.
        if name is None:
            name = sb.stripFName(CFFileName)

        # Find the start of the first base.
        ln = CFFile.readline()
        if ln == '':
            raise NotACountsFormatFileError("File contains no data.")
        # Skip comments.  `startswith` (instead of `ln[0]`) also copes
        # with the empty string that is returned at the end of the file.
        while ln.startswith('#'):
            ln = CFFile.readline()

        # Read in first line.  The length is checked first so that blank
        # lines are reported as corrupt instead of raising an IndexError.
        lnL = ln.split()
        if len(lnL) != 5 or lnL[0] != "COUNTSFILE":
            raise NotACountsFormatFileError("First line is corrupt.")
        # TODO: The first line is needed by IQ-Tree, but not by
        # cflib. Maybe I should use this information here!

        ln = CFFile.readline()
        # Skip comments.
        while ln.startswith('#'):
            ln = CFFile.readline()

        # Read in headerline.
        lnL = ln.split()
        if (len(lnL) < 2
                or lnL[0] not in ("CHROM", "Chrom")
                or lnL[1] not in ("POS", "Pos")):
            raise NotACountsFormatFileError("Header line is corrupt.")
        indivL = [field.strip() for field in lnL[2:]]

        ln = CFFile.readline()
        (chrom, pos, countsL) = interpret_cf_line(ln)
        if len(countsL) != len(indivL):
            raise NotACountsFormatFileError("Line doesn't fit nr. of species.")
    except Exception:
        # Do not leak the file handle if the initialization fails.
        CFFile.close()
        raise

    self.name = name
    self.chrom = chrom
    self.pos = pos
    self.fo = CFFile
    self.indivL = indivL
    self.countsL = countsL
    self.nIndiv = len(countsL)
def __init__(self, CFFileName, name=None):
    """Open a counts file and initialize the stream with the first line.

    After opening the file with ``sb.gz_open``, comment lines (starting
    with ``#``) are skipped and the ``COUNTSFILE`` line as well as the
    header line (``CHROM POS <individuals...>``) are validated.  The
    first data line is parsed with ``interpret_cf_line``.  On success the
    open file object is kept in ``self.fo``; on failure it is closed.

    Sets the attributes ``name``, ``chrom``, ``pos``, ``fo``, ``indivL``,
    ``countsL`` and ``nIndiv``.

    :param str CFFileName: Name of the counts file.
    :param str name: Name of the stream; if it is None, the name is set
      to ``sb.stripFName(CFFileName)``.

    :raises NotACountsFormatFileError: If the file contains no data, if
      the first line or the header line is corrupt or missing, or if the
      number of counts of the first data line does not match the number
      of individuals in the header line.

    """
    def next_non_comment_line(ln):
        """Return *ln*, or the next line that is not a comment."""
        # `startswith` is False for the empty string returned at the end
        # of the file, so this terminates (`ln[0]` would raise there).
        while ln.startswith('#'):
            ln = CFFile.readline()
        return ln

    CFFile = sb.gz_open(CFFileName)
    try:
        # Set the cf sequence name.
        if name is None:
            name = sb.stripFName(CFFileName)

        # Find the start of the first base.
        ln = CFFile.readline()
        if ln == '':
            raise NotACountsFormatFileError("File contains no data.")

        # Read in first line; an empty field list (blank line or end of
        # file) counts as corrupt.
        fieldL = next_non_comment_line(ln).split()
        if len(fieldL) != 5 or fieldL[0] != "COUNTSFILE":
            raise NotACountsFormatFileError("First line is corrupt.")
        # TODO: The first line is needed by IQ-Tree, but not by
        # cflib. Maybe I should use this information here!

        # Read in headerline.
        fieldL = next_non_comment_line(CFFile.readline()).split()
        headerOK = (len(fieldL) >= 2
                    and fieldL[0] in ["CHROM", "Chrom"]
                    and fieldL[1] in ["POS", "Pos"])
        if not headerOK:
            raise NotACountsFormatFileError("Header line is corrupt.")
        indivL = [field.strip() for field in fieldL[2:]]

        ln = CFFile.readline()
        (chrom, pos, countsL) = interpret_cf_line(ln)
        if len(countsL) != len(indivL):
            raise NotACountsFormatFileError("Line doesn't fit nr. of species.")
    except Exception:
        # The object is not usable; release the file handle.
        CFFile.close()
        raise

    self.name = name
    self.chrom = chrom
    self.pos = pos
    self.fo = CFFile
    self.indivL = indivL
    self.countsL = countsL
    self.nIndiv = len(countsL)
def init_seq(faFileName, maxskip=50, name=None):
    """Open a fasta file and initialize an :class:`FaStream`.

    This function tries to open the given fasta file, checks if it is in
    fasta format and reads the first sequence.  It returns an
    :class:`FaStream` object.  This object can later be used to parse the
    whole fasta file.

    Please close the associated file object with :func:`FaStream.close`
    when you don't need it anymore.  If the initialization fails, the
    file is closed by this function.

    :param str faFileName: File name of the fasta file.
    :param int maxskip: Only look *maxskip* lines for the start of a
                        sequence (defaults to 50).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the stripped filename.

    :raises NotAFastaFileError: If the end of the file is reached before
      a header line is found, or if no header line is found within
      *maxskip* lines.

    """
    faFile = sb.gz_open(faFileName)
    try:
        if name is None:
            name = sb.stripFName(faFileName)
        # Find the start of the first sequence.
        for _ in range(maxskip):
            line = faFile.readline()
            if line == '':
                raise NotAFastaFileError("File contains no data.")
            if line.startswith('>'):
                # Species name found in line.
                break
        else:
            # `maxskip` is an int; it has to be converted explicitly,
            # otherwise building the message raises a TypeError.
            raise NotAFastaFileError("Didn't find a species header within " +
                                     str(maxskip) + " lines.")
        (nextHL, seq) = read_seq_from_fo(line, faFile)
        try:
            nextHL = nextHL.rstrip()
        except AttributeError:
            # `nextHL` is not a string (presumably None when the file
            # contains a single sequence only); keep it unchanged.
            pass
        faStr = FaStream(name, seq, nextHL, faFile)
    except Exception:
        # Do not leak the file handle if the initialization fails.
        faFile.close()
        raise
    return faStr
def init_seq(VCFFileName, maxskip=100, name=None):
    """Open a (gzipped) VCF4.1 file.

    Try to open the given VCF file, checks if it is in VCF format.
    Initialize a :class:`VCFStream` object that contains the first base.

    Please close the associated file object with
    :func:`VCFStream.close` when you don't need it anymore.  If the
    initialization fails, the file is closed by this function.

    :param str VCFFileName: Name of the VCF file.
    :param int maxskip: Only look *maxskip* lines for the start of the
                        bases (defaults to 100).
    :param str name: Set the name of the sequence to *name*, otherwise
                     set it to the filename.

    :raises NotAVariantCallFormatFileError: If the end of the file is
      reached before the ``#CHROM`` line is found, or if this line is not
      found within *maxskip* lines.

    """
    VCFFile = sb.gz_open(VCFFileName)
    try:
        # Set the vcf sequence name.
        if name is None:
            name = sb.stripFName(VCFFileName)
        # Find the start of the first base.
        for _ in range(maxskip):
            line = VCFFile.readline()
            if line == '':
                raise NotAVariantCallFormatFileError("File contains no data.")
            if line.startswith('#CHROM'):
                # Here starts the data.
                check_fixed_field_header(line)
                speciesL = get_indiv_from_field_header(line)
                break
        else:
            raise NotAVariantCallFormatFileError(
                "Didn't find any data within " + str(maxskip) + " lines.")
        # The line after the field header holds the first base.
        line = VCFFile.readline()
        base = get_nuc_base_from_line(line, info=False)
        base.set_ploidy()
        stream = VCFStream(name, VCFFile, speciesL, base)
    except Exception:
        # Do not leak the file handle if the initialization fails.
        VCFFile.close()
        raise
    return stream