def concatLoci(self, repID):
     """
     Concatenate the per-locus alignments of one replicate, key by key.
     ------------------------------------------------------------------------
     For every locus two files are parsed from the replicate folder: the
     "<prefix>_<locus>.fasta" alignment and its "<prefix>_<locus>_TRUE.fasta"
     counterpart. The first locus seeds the result; every following locus
     is appended to the entry with the same description, separated by a
     run of self.nsize "N" characters (no separator when nsize <= 0).
     The two results are handed to self.writemsainplace under the
     "fasta" and "true" tags.
     Args: repID: One-based identifier of the replicate.
     Returns: Nothing
     """
     spacer = "N" * self.nsize if self.nsize > 0 else ""
     merged = None
     mergedTrue = None
     # NOTE(review): the loop bound is the *digit width* stored in
     # numLociPerReplicateDigits, not a locus count - confirm whether the
     # number of loci of the replicate was intended here.
     for locus in range(1, self.numLociPerReplicateDigits[repID - 1] + 1):
         folder = os.path.join(
             self.path,
             "{0:0{1}d}".format(repID, self.numReplicatesDigits))
         stem = "{0}_{1:0{2}d}".format(
             self.inputprefix, locus,
             self.numLociPerReplicateDigits[repID - 1])
         observed = msatools.parseMSAFileWithDescriptions(
             os.path.join(folder, stem + ".fasta"))
         truth = msatools.parseMSAFileWithDescriptions(
             os.path.join(folder, stem + "_TRUE.fasta"))
         if locus == 1:
             # The first locus seeds both accumulators.
             merged, mergedTrue = observed, truth
             continue
         for name in observed:
             merged[name] = "{}{}{}".format(
                 merged[name], spacer, observed[name])
             mergedTrue[name] = "{}{}{}".format(
                 mergedTrue[name], spacer, truth[name])
     self.writemsainplace(repID, "fasta", merged)
     self.writemsainplace(repID, "true", mergedTrue)
Esempio n. 2
0
	def correctContentReferenceSequence(self):
		"""
		Verifies content of the reference sequence file for nucleotides only.
		------------------------------------------------------------------------
		The description is taken from the first line of the file with its
		first character dropped (presumably the FASTA ">" marker), and is
		used to look up the sequence in the parsed file. Every symbol of
		that sequence is upper-cased and checked against the allowed
		nucleotide set.
		Returns:
		- boolean, message: the status of the process and the message related to
		such status (empty message on success)
		"""
		self.appLogger.debug("checking content of the refernece sequence")
		status=True; message=""
		# Only the first line is needed to get the description; the context
		# manager guarantees the handle is closed even if reading fails.
		with open(self.ancestralSequenceFilePath, "r") as f:
			description=f.readline().strip()[1:]
		referenceDict=msatools.parseMSAFileWithDescriptions(self.ancestralSequenceFilePath)
		reference=referenceDict[description]
		validSymbols=self.__NUCLEOTIDES
		# any() stops at the first invalid symbol, like the original loop/break.
		if any(item.upper() not in validSymbols for item in reference):
			status=False
			message="\n\t{0}{1}{2}\n\t{3}\n\t{4}".format(
				"[data] block: Input mode (",self.inputmode,") selected but invalid option.",
				"Reference sequence should be a nucleotidic sequence, but some other characters exist..",
				"Please verify. Exiting."
			)
		return status, message
Esempio n. 3
0
	def methodConsensusAll(self,index):
		"""
		Method 3 for the selection of reference loci.
		------------------------------------------------------------------------
		Computes the consensus from all the sequences of each locus file of
		a replicate, writes it as the reference locus and records a label
		in the third field of the matching self.sequenceList entry.
		Args: index: Zero-based index of the replicate (species tree) that
		is being used; the replicate folder ID is index+1.
		Returns: Nothing
		"""
		repID=index+1
		# Position of this replicate's first locus within self.sequenceList:
		# the loci of all previous replicates come first (0 when index is 0).
		sequenceListIndex=sum(self.numLociPerReplicate[0:index])
		numLoci=self.numLociPerReplicate[index]
		for locID in range(1,numLoci+1):
			APPLOGGER.info("Locus {0}/{1} | Table Index: {2}".format(locID,numLoci, sequenceListIndex))
			fastapath=os.path.join(
				self.path,
				"{0:0{1}d}".format(repID, self.numReplicatesDigits),
				"{0}_{1:0{2}d}.fasta".format(self.inputprefix,locID, self.numLociPerReplicateDigits[index])
			)
			lociData=msatools.parseMSAFileWithDescriptions(fastapath)
			# Iteration order over the key set is kept as in the original so
			# the list handed to computeConsensus is built the same way.
			sequences=[lociData[mk] for mk in set(lociData.keys())]
			selected=self.computeConsensus(sequences)
			self.writeLocus(index,locID,">consensus_all",selected)
			mytuple=list(self.sequenceList[sequenceListIndex])
			# The consensus is built from every sequence, so there is no
			# single source key to embed in the label. The previous code
			# formatted a name (rndKey1) that is not defined in this method,
			# which raised NameError after the first locus was written.
			mytuple[2]="CONSENSUS_ALL"
			self.sequenceList[sequenceListIndex]=tuple(mytuple)
			sequenceListIndex+=1
		APPLOGGER.info("Done all ingroups consensus")
Esempio n. 4
0
	def seqPerLocus(self,index):
		"""
		Method 1 for the selection of reference loci.
		------------------------------------------------------------------------
		This method selects a sequence per locus as indicated in the seq_desc_file.
		Each parsed entry provides, in order, the replicate ID, the locus ID
		and the description of the sequence to select from the locus
		"_TRUE.fasta" file.
		Args: index: Index of the species tree that is being used.
		Returns: Nothing
		Raises: NRSException when the requested description is not present
		in the parsed locus file.
		"""
		entries=self.parseReferenceLociFile(self.seqDescriptionFile)
		for entry in entries:
			repID=entry[0]
			locID=entry[1]
			seqID=entry[2]
			APPLOGGER.info("Locus {0}/{1}".format(locID,self.numLociPerReplicate[index]))
			fastapath=os.path.join(
				self.path,
				"{0:0{1}d}".format(repID, self.numReplicatesDigits),
				"{0}_{1:0{2}d}_TRUE.fasta".format(self.inputprefix,locID,self.numLociPerReplicateDigits[index]),
			)
			fastaFile=msatools.parseMSAFileWithDescriptions(fastapath)
			try:
				sequence=fastaFile[seqID]
			except KeyError:
				# Only a missing description is reported this way; a bare
				# except would also hide interrupts and unrelated errors
				# behind this message.
				message="{0}\n\t{1}".format(
					"One of the selected sequences (description) has not been found on this file.",
					"Please verify. Exiting"
				)
				raise NRSException(False, message, datetime.datetime.now()-self.startTime)

			self.writeLocus(index,locID,seqID,sequence)
		APPLOGGER.info("Done Seq Per locus")
Esempio n. 5
0
	def methodOutgroup(self,index):
		"""
		Method 0 for the selection of reference loci.
		------------------------------------------------------------------------
		For every locus of the replicate, the sequence stored under the
		description "0_0_0" is taken from the locus file and written as the
		reference locus; the same description is recorded in the third field
		of the matching self.sequenceList entry.
		Args: index: Zero-based index of the replicate (species tree) that
		is being used; the replicate folder ID is index+1.
		Returns: Nothing
		"""
		APPLOGGER.debug("method outgroup")
		outgroupLabel="0_0_0"
		replicateFolder=index+1
		# Loci of previous replicates come first in self.sequenceList
		# (the sum over an empty slice is already 0 for the first replicate).
		tableIndex=sum(self.numLociPerReplicate[0:index])
		for locID in range(1,self.numLociPerReplicate[index]+1):
			APPLOGGER.info("Locus {0}/{1} | Table Index: {2}".format(locID,self.numLociPerReplicate[index], tableIndex))
			# Record the chosen description before reading the locus file.
			fields=list(self.sequenceList[tableIndex])
			fields[2]=outgroupLabel
			self.sequenceList[tableIndex]=tuple(fields)
			folderName="{0:0{1}d}".format(replicateFolder, self.numReplicatesDigits)
			fileName="{0}_{1:0{2}d}.fasta".format(self.inputprefix,locID, self.numLociPerReplicateDigits[index])
			alignment=msatools.parseMSAFileWithDescriptions(
				os.path.join(self.path,folderName,fileName)
			)
			self.writeLocus(index,locID,outgroupLabel,alignment[outgroupLabel])
			tableIndex+=1
		APPLOGGER.info("Done outgroup sequence")
Esempio n. 6
0
	def __init__(self, cmdArgs):
		"""
		Set up the logger, resolve the input/output paths and parse the MSA.
		------------------------------------------------------------------------
		Args: cmdArgs: object exposing the "input" and "output" paths.
		Raises: GACMSAException when the input file, or the folder of the
		output file, does not exist.
		"""
		logger=logging.getLogger("gac-msa")
		self.appLogger=logger
		self.startime=datetime.datetime.now()
		logger.info("Start...")
		self.inputfile=os.path.abspath(cmdArgs.input)
		self.outputfile=os.path.abspath(cmdArgs.output)
		# Work out which check fails first (if any), then raise in one place.
		problem=None
		if not os.path.exists(self.inputfile):
			problem="Input file does not exist.\nPlease Verify."
		elif not os.path.exists(os.path.dirname(self.outputfile)):
			problem="Path of the output file does not exist.\nPlease Verify."
		if problem is not None:
			raise GACMSAException(False, problem, datetime.datetime.now()-self.startime)
		logger.info("Parsing MSA file: {}".format(self.inputfile))
		self.msa=msatools.parseMSAFileWithDescriptions(self.inputfile)
Esempio n. 7
0
    def copyAncestralSequenceToOutputFolder(self):
        """
        In order to generate genome sequences all the required files must be
        in the same folder where INDELible is going to be ran. Hence, the need
        of copying the given reference file to the directory where data will
        be stored.

        Only one sequence is written: the one whose description is found on
        the first line of the source file (with its first character
        dropped). It is stored under the fixed header ">ngsphypartition".
        -----------------------------------------------------------------------
        Returns:
        - boolean, message: the status of the process and the message related
        to such status (empty message on success).
        """
        status = True
        message = ""
        self.appLogger.debug("Copying reference sequence file ")
        self.appLogger.info("Copying reference sequence file to: {}".format(
            self.newIndelibleAncestralSequence))
        try:
            with open(self.settings.ancestralSequenceFilePath, "r") as f:
                description = f.readline().strip()
        except Exception as ex:
            message = "\n\t{0}\n\t{1}\n\t{2}\n\t{3}\n".format(
                "I/O problem.",
                ex,
                "Stopped while reading the ancestral sequence file.",
                "Please verify and rerun. Exiting."
            )
            return False, message
        # Drop the leading marker character of the description line.
        description = description[1:]
        referenceDict = msatools.parseMSAFileWithDescriptions(
            self.settings.ancestralSequenceFilePath)
        reference = referenceDict[description]
        try:
            # The context manager closes the handle even when write() fails.
            with open(self.newIndelibleAncestralSequence, "w") as fout:
                fout.write(">ngsphypartition\n{}\n".format(reference))
        except Exception as ex:
            message = "\n\t{0}\n\t{1}\n\t{2}\n\t{3}\n".format(
                "I/O problem.",
                ex,
                "Stopped while copying the ancestral sequence file.",
                "Please verify and rerun. Exiting."
            )
            return False, message
        return status, message
Esempio n. 8
0
 def __init__(self, cmdArgs):
     """
     Prepare logging, resolve the input/output paths and load the MSA.

     cmdArgs must expose the "input" and "output" paths. A
     GACMSAException is raised when the input file, or the folder that
     should hold the output file, does not exist.
     """
     self.appLogger = logging.getLogger("gac-msa")
     self.startime = datetime.datetime.now()
     self.appLogger.info("Start...")
     self.inputfile = os.path.abspath(cmdArgs.input)
     self.outputfile = os.path.abspath(cmdArgs.output)
     # Locations that must exist, checked in order, with their complaint.
     requirements = (
         (self.inputfile,
          "Input file does not exist.\nPlease Verify."),
         (os.path.dirname(self.outputfile),
          "Path of the output file does not exist.\nPlease Verify."),
     )
     for location, complaint in requirements:
         if not os.path.exists(location):
             raise GACMSAException(
                 False, complaint,
                 datetime.datetime.now() - self.startime)
     self.appLogger.info("Parsing MSA file: {}".format(self.inputfile))
     self.msa = msatools.parseMSAFileWithDescriptions(self.inputfile)